blob: ac94b693e376b3b8b41107b87fdb3f8440ea042f [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
Gian Marcoc799ed82018-02-01 16:57:48 +00002 * Copyright (c) 2017-2018 ARM Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26
#if defined(CONV_STRIDE_X)

/* Select the 1x3 horizontal convolution implementation matching the
 * compile-time horizontal stride. CONV_STRIDE_X must be 1, 2 or 3;
 * any other value is rejected at build time. */
#if CONV_STRIDE_X == 1
#define convolution1x3 convolution1x3_stride_1
#elif CONV_STRIDE_X == 2
#define convolution1x3 convolution1x3_stride_2
#elif CONV_STRIDE_X == 3
#define convolution1x3 convolution1x3_stride_3
#else /* CONV_STRIDE_X */
#error "Stride not supported"
#endif /* CONV_STRIDE_X */
38
39/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
40 *
41 * @param[in] left_pixel Pointer to the left pixel.
42 * @param[in] left_coeff Weight of the left pixel
43 * @param[in] middle_coeff Weight of the middle pixel
44 * @param[in] right_coeff Weight of the right pixel
45 *
46 * @return a float2 containing 2 convoluted values.
47 */
48inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
49 const float left_coeff,
50 const float middle_coeff,
51 const float right_coeff)
52{
53 float4 temp = vload4(0, (__global float *)left_pixel);
54
55 float2 left = CONVERT(temp.s01, float2);
56 float2 middle = CONVERT(temp.s12, float2);
57 float2 right = CONVERT(temp.s23, float2);
58
59 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
60}
61
62/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
63 *
64 * @param[in] left_pixel Pointer to the left pixel.
65 * @param[in] left_coeff Weight of the left pixel
66 * @param[in] middle_coeff Weight of the middle pixel
67 * @param[in] right_coeff Weight of the right pixel
68 *
69 * @return a float2 containing 2 convoluted values.
70 */
71inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
72 const float left_coeff,
73 const float middle_coeff,
74 const float right_coeff)
75{
76 float4 temp0 = vload4(0, (__global float *)left_pixel);
77 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
78
79 float2 left = CONVERT(temp0.s02, float2);
80 float2 middle = CONVERT(temp0.s13, float2);
81 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
82
83 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
84}
85
86/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
87 *
88 * @param[in] left_pixel Pointer to the left pixel.
89 * @param[in] left_coeff Weight of the left pixel
90 * @param[in] middle_coeff Weight of the middle pixel
91 * @param[in] right_coeff Weight of the right pixel
92 *
93 * @return a float2 containing 2 convoluted values.
94 */
95inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
96 const float left_coeff,
97 const float middle_coeff,
98 const float right_coeff)
99{
100 float4 temp0 = vload4(0, (__global float *)left_pixel);
101 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
102
103 float2 left = CONVERT(temp0.s03, float2);
104 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
105 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
106
107 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
108}
109
/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
 *
 * Convolution matrix layout:
 *
 * [ mat0, mat1, mat2 ]\n
 * [ mat3, mat4, mat5 ]\n
 * [ mat6, mat7, mat8 ]\n
 *
 * @param[in] src  A pointer to source Image structure
 * @param[in] mat0 Coefficient from the convolution matrix
 * @param[in] mat1 Coefficient from the convolution matrix
 * @param[in] mat2 Coefficient from the convolution matrix
 * @param[in] mat3 Coefficient from the convolution matrix
 * @param[in] mat4 Coefficient from the convolution matrix
 * @param[in] mat5 Coefficient from the convolution matrix
 * @param[in] mat6 Coefficient from the convolution matrix
 * @param[in] mat7 Coefficient from the convolution matrix
 * @param[in] mat8 Coefficient from the convolution matrix
 *
 * @return a float2 containing 2 convoluted values.
 */
inline float2 convolution3x3(
    Image      *src,
    const float mat0, const float mat1, const float mat2,
    const float mat3, const float mat4, const float mat5,
    const float mat6, const float mat7, const float mat8)
{
    float2 pixels;

    // Accumulate one 1x3 horizontal convolution per row of the 3x3 window
    pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
    pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
    pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);

    return pixels;
}
146
/** This OpenCL kernel computes the depthwise convolution 3x3
 *
 * @note The horizontal stride must be passed at compile time using -DCONV_STRIDE_X (1, 2 or 3).
 *
 * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
 * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)

    // Byte offsets of the three weight rows (row i starts at i * weights_stride_y)
    uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
    float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
    float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
    float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));

    // Two horizontally-adjacent output values per work-item
    float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                   weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                   weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
    // One bias value per channel, indexed by the Z (channel) global id
    pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
#endif //defined(HAS_BIAS)

    vstore2(pixels, 0, (__global float *)dst.ptr);
}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100209#endif //defined(CONV_STRIDE_X)
210
/** Accumulate a 1x3 convolution at stride 1 for two adjacent outputs into acc.
 *
 * acc.s0 accumulates src0.s012 * weights_row0, acc.s1 accumulates src0.s123 * weights_row0.
 * Written as explicit fma chains to map onto Bifrost's fused multiply-add pipelines.
 */
#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
    ({                                                             \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
    })
220
/** Accumulate a 1x3 convolution at stride 2 for two adjacent outputs into acc.
 *
 * acc.s0 accumulates src0.s012 * weights_row0; acc.s1 accumulates
 * (src0.s2, src0.s3, src1.s0) * weights_row0 — the fifth input element comes
 * from src1. Written as explicit fma chains for Bifrost's FMA pipelines.
 */
#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
    ({                                                                   \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
        acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1);                  \
    })
230
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
 * stride_x and stride_y are equal to 1
 *
 * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
 * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

    // Accumulators for the 4 output rows (2 output columns each)
    float2 pixels0 = 0.0f;
    float2 pixels1 = 0.0f;
    float2 pixels2 = 0.0f;
    float2 pixels3 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);

    // Load the weights
    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
    float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5

    // Each output row n convolves input rows n, n+1, n+2 against weight rows 0, 1, 2
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    // One bias value per channel (Z), added to all 8 outputs
    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));

    pixels0 += (float2)bias;
    pixels1 += (float2)bias;
    pixels2 += (float2)bias;
    pixels3 += (float2)bias;
#endif /* defined(HAS_BIAS) */

    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
    vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
    vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
}
327
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
 * stride_x and stride_y are equal to 2
 *
 * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
 * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

    // Accumulators for the 2 output rows (2 output columns each)
    float2 pixels0 = 0.0f;
    float2 pixels1 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);

    // Load the weights
    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 2x2 elements at stride 2, we need to load 5 rows
    // of 6 values each (a float4 plus a float2 tail) from the input tensor
    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
    float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4

    // Output row n convolves input rows 2n, 2n+1, 2n+2 against weight rows 0, 1, 2
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    // One bias value per channel (Z), added to all 4 outputs
    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));

    pixels0 += (float2)bias;
    pixels1 += (float2)bias;
#endif /* defined(HAS_BIAS) */

    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
}
416
Giorgio Arena9fe41442017-08-23 16:36:24 +0100417#if defined(SRC_WIDTH) && defined(DATA_TYPE)
418/** This kernel reshapes each of the tensor's low three dimensions to single rows.
419 *
420 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
421 *
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100422 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
423 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
424 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
425 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
426 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
427 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
428 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
429 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
430 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
431 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
432 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
433 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
434 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
435 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
436 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
437 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
438 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
439 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Giorgio Arena9fe41442017-08-23 16:36:24 +0100440 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100441__kernel void depthwise_weights_reshape(
442 TENSOR3D_DECLARATION(src),
443 IMAGE_DECLARATION(dst)
444#ifdef HAS_BIAS
445 ,
446 VECTOR_DECLARATION(biases)
447#endif /* HAS_BIAS */
448)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100449{
450 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100451#ifdef HAS_BIAS
452 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
453#endif /* HAS_BIAS */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100454
455 __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
456 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
457
458 for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)
459 {
460 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
461 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100462
463#if defined(HAS_BIAS)
464 if(get_global_id(1) == 0)
465 {
466 *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x));
467 }
468#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100469}
470#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
471
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100472#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
 *
 * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
 * @param[in] src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                          Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
{
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Map the linear patch index (global id 1) back to the top-left source
    // coordinate of the KERNEL_WIDTH x KERNEL_HEIGHT patch, in the padded frame
    const int src_pixel_linear = get_global_id(1) * STRIDE_X;
    const int full_length      = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
    const int max_initial_x    = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);

    const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
    const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
    const int src_z = get_global_id(2);

    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));

    // Copy the patch row-major into the destination; out-of-bounds (padding)
    // positions are written as zero
    for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
    {
        for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
        {
            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
            {
                *output_ptr = 0;
            }
            else
            {
                *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
            }
        }
    }
#if defined(HAS_BIAS)
    // Append a trailing 1 after the patch — presumably so the bias term is folded
    // into the subsequent vector-matrix multiplication; confirm against the GEMV caller
    *output_ptr = (DATA_TYPE)(1);
#endif // defined(HAS_BIAS)
}
528
#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100530
531#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
532
533/** This kernel performs a reshaping of the output of the depthwise generic convolution.
534 *
535 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
536 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
537 *
538 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
539 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
540 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
541 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
542 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
543 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
544 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
545 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
546 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
547 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
548 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
549 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
550 */
551__kernel void depthwise_vector_to_tensor(
552 VECTOR_DECLARATION(src),
553 TENSOR3D_DECLARATION(dst))
554{
555 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
556
557 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
558 const int id0 = get_global_id(0);
559 const int z = id0 / patch_size;
560 const int index2D = id0 - z * patch_size;
561
562 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z;
563 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
564}
565
566#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)