blob: 22a38e70941b8204425066a9a6390b42a95b3170 [file] [log] [blame]
Gian Marco Iodicec63b7222021-06-30 08:39:44 +00001/*
2 * Copyright (c) 2017-2021 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
26#include "activation_float_helpers.h"
27
28/** Get the pointer position at a certain offset in x and y direction.
29 *
30 * @param[in] ptr Pointer to the starting position of the buffer
31 * @param[in] x Relative X position
32 * @param[in] y Relative Y position
33 * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
34 * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
35 *
36 * @return a uchar
37 */
38inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
39{
40 return ptr + x * stride_x + y * stride_y;
41}
42
43#if(DILATION_X == 1 && DILATION_Y == 1)
44
45#define CONVOLUTION1x3_2X1_STRIDE1(acc, src0, weights_row0) \
46 ({ \
47 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
48 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
49 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
50 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
51 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
52 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
53 })
54
55#define CONVOLUTION1x3_4X1_STRIDE1(acc, src0, weights_row0) \
56 ({ \
57 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
58 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
59 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
60 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
61 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
62 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
63 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
64 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
65 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
66 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
67 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
68 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
69 })
70
71#define CONVOLUTION1x3_2X1_STRIDE2(acc, src0, src1, weights_row0) \
72 ({ \
73 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
74 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
75 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
76 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
77 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
78 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
79 })
80
81#define CONVOLUTION1x3_4X1_STRIDE2(acc, src0, src1, weights_row0) \
82 ({ \
83 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
84 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
85 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
86 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
87 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
88 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
89 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
90 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
91 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
92 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
93 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
94 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
95 })
96
97#else /* DILATION_X==1 && DILATION_Y==1 */
98
99#define CONVOLUTION1x3_2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
100 ({ \
101 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
102 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
103 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
104 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
105 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
106 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
107 })
108
109#define CONVOLUTION1x3_2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
110 ({ \
111 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
112 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
113 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
114 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
115 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
116 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
117 })
118
119#define CONVOLUTION1x3_4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
120 ({ \
121 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
122 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
123 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
124 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
125 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
126 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
127 acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
128 acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
129 acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
130 acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
131 acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
132 acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
133 })
134
135#define CONVOLUTION1x3_4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
136 ({ \
137 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
138 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
139 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
140 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
141 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
142 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
143 acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
144 acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
145 acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
146 acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
147 acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
148 acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
149 })
150
151#endif /* DILATION_X==1 && DILATION_Y==1 */
152
153#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
154#if defined(CONV_STRIDE_X)
155
156#if CONV_STRIDE_X == 1
157#define convolution1x3 convolution1x3_stride_1
158#elif CONV_STRIDE_X == 2
159#define convolution1x3 convolution1x3_stride_2
160#elif CONV_STRIDE_X == 3
161#define convolution1x3 convolution1x3_stride_3
162#else /* CONV_STRIDE_X */
163#error "Stride not supported"
164#endif /* CONV_STRIDE_X */
165
166/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
167 *
168 * @param[in] left_pixel Pointer to the left pixel.
169 * @param[in] left_coeff Weight of the left pixel
170 * @param[in] middle_coeff Weight of the middle pixel
171 * @param[in] right_coeff Weight of the right pixel
172 *
173 * @return a float2 containing 2 convoluted values.
174 */
175inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
176 const float left_coeff,
177 const float middle_coeff,
178 const float right_coeff)
179{
180#if(DILATION_X == 1 && DILATION_Y == 1)
181 float4 temp = vload4(0, (__global float *)left_pixel);
182
183 float2 left = CONVERT(temp.s01, float2);
184 float2 middle = CONVERT(temp.s12, float2);
185 float2 right = CONVERT(temp.s23, float2);
186 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
187#else /* DILATION_X==1 && DILATION_Y==1 */
188 return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
189 + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
190 + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
191#endif /* DILATION_X==1 && DILATION_Y==1 */
192}
193
194/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
195 *
196 * @param[in] left_pixel Pointer to the left pixel.
197 * @param[in] left_coeff Weight of the left pixel
198 * @param[in] middle_coeff Weight of the middle pixel
199 * @param[in] right_coeff Weight of the right pixel
200 *
201 * @return a float2 containing 2 convoluted values.
202 */
203inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
204 const float left_coeff,
205 const float middle_coeff,
206 const float right_coeff)
207{
208#if(DILATION_X == 1 && DILATION_Y == 1)
209 float4 temp0 = vload4(0, (__global float *)left_pixel);
210 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
211
212 float2 left = CONVERT(temp0.s02, float2);
213 float2 middle = CONVERT(temp0.s13, float2);
214 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
215
216 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
217#else /* DILATION_X==1 && DILATION_Y==1 */
218 __global float *left_pixel_float = (__global float *)left_pixel;
219
220 return vload4(0, left_pixel_float).s02 * (float2)left_coeff
221 + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
222 + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
223
224#endif /* DILATION_X==1 && DILATION_Y==1 */
225}
226
227/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
228 *
229 * @param[in] left_pixel Pointer to the left pixel.
230 * @param[in] left_coeff Weight of the left pixel
231 * @param[in] middle_coeff Weight of the middle pixel
232 * @param[in] right_coeff Weight of the right pixel
233 *
234 * @return a float2 containing 2 convoluted values.
235 */
236inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
237 const float left_coeff,
238 const float middle_coeff,
239 const float right_coeff)
240{
241#if(DILATION_X == 1 && DILATION_Y == 1)
242 float4 temp0 = vload4(0, (__global float *)left_pixel);
243 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
244
245 float2 left = CONVERT(temp0.s03, float2);
246 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
247 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
248
249 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
250#else /* DILATION_X==1 && DILATION_Y==1 */
251 __global float *left_pixel_float = (__global float *)left_pixel;
252
253 return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
254 + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
255 + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
256#endif /* DILATION_X==1 && DILATION_Y==1 */
257}
258
259/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
260 *
261 * Convolution matrix layout:
262 *
263 * [ mat0, mat1, mat2 ]\n
264 * [ mat3, mat4, mat5 ]\n
265 * [ mat6, mat7, mat8 ]\n
266 *
267 * @param[in] src A pointer to source Image structure
268 * @param[in] mat0 Coefficient from the convolution matrix
269 * @param[in] mat1 Coefficient from the convolution matrix
270 * @param[in] mat2 Coefficient from the convolution matrix
271 * @param[in] mat3 Coefficient from the convolution matrix
272 * @param[in] mat4 Coefficient from the convolution matrix
273 * @param[in] mat5 Coefficient from the convolution matrix
274 * @param[in] mat6 Coefficient from the convolution matrix
275 * @param[in] mat0 Coefficient from the convolution matrix
276 * @param[in] mat7 Coefficient from the convolution matrix
277 * @param[in] mat8 Coefficient from the convolution matrix
278 *
279 * @return a float2 containing 2 convoluted values.
280 */
281inline float2 convolution3x3(
282 __global const uchar *src,
283 unsigned int src_stride_y,
284 const float mat0, const float mat1, const float mat2,
285 const float mat3, const float mat4, const float mat5,
286 const float mat6, const float mat7, const float mat8)
287{
288 float2 pixels;
289
290 pixels = convolution1x3((src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
291 pixels += convolution1x3((src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
292 pixels += convolution1x3((src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
293
294 return pixels;
295}
296
297/** This OpenCL kernel computes the depthwise convolution 3x3
298 *
299 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
300 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
301 *
302 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
303 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
304 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
305 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
306 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
307 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
308 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
309 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
310 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
311 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
312 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
313 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
314 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
315 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
316 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
317 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
318 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
319 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
320 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
321 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
322 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
323 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
324 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
325 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
326 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
327 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
328 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
329 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
330 */
331__kernel void depthwise_convolution_3x3(
332 TENSOR3D_DECLARATION(src),
333 TENSOR3D_DECLARATION(dst),
334 TENSOR3D_DECLARATION(weights)
335#if defined(HAS_BIAS)
336 ,
337 VECTOR_DECLARATION(biases)
338#endif //defined(HAS_BIAS)
339)
340{
341 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
342 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
343
344 float2 pixels = 0.0f;
345
346 // Extract channel and linearized batch indices
347 const int channel = get_global_id(2) % DST_CHANNELS;
348 const int batch = get_global_id(2) / DST_CHANNELS;
349 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
350
351 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
352
353 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
354 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
355
356 // Load the weights
357 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
358 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
359 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
360
361 pixels = convolution3x3(src_addr, src_stride_y,
362 weights_values0.s0, weights_values0.s1, weights_values0.s2,
363 weights_values1.s0, weights_values1.s1, weights_values1.s2,
364 weights_values2.s0, weights_values2.s1, weights_values2.s2);
365#if defined(HAS_BIAS)
366 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
367
368 float bias = *((__global float *)(vector_offset(&biases, channel)));
369
370 pixels += (float2)bias;
371#endif //defined(HAS_BIAS)
372
373 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
374}
375#endif //defined(CONV_STRIDE_X)
376
377#if(DILATION_X > 1 || DILATION_Y > 1)
378
379/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
380 *
381 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
382 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
383 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
384 * @param[in] y_offset Offset from the source tensor from which to start convolution
385 * @param[in] weights_addr Pointer from where to get weights
386 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
387 */
388inline float2 convolution_3x3_dilation_stridex1_stridey1_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
389 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
390{
391 // Load the weights
392 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
393 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
394 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
395
396 float2 pixels0 = 0.0f;
397
398 float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
399 float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
400 float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
401
402 float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
403 float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
404 float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
405
406 float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
407 float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
408 float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
409
410 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
411 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
412 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
413
414 return pixels0;
415}
416
417/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
418 *
419 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
420 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
421 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
422 * @param[in] y_offset Offset from the source tensor from which to start convolution
423 * @param[in] weights_addr Pointer from where to get weights
424 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
425 */
426inline float2 convolution_3x3_dilation_stridex2_stridey2_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
427 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
428{
429 // Load the weights
430 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
431 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
432 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
433
434 float2 pixels0 = 0.0f;
435
436 float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
437 float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
438 float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
439
440 float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
441 float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
442 float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
443
444 float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
445 float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
446 float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
447
448 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
449 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
450 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
451
452 return pixels0;
453}
454
455#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
456
457/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
458 * stride_x and stride_y are equal to 1
459 *
460 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
461 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
462 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
463 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
464 *
465 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
466 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
467 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
468 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
469 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
470 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
471 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
472 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
473 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
474 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
475 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
476 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
477 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
478 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
479 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
480 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
481 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
482 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
483 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
484 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
485 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
486 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
487 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
488 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
489 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
490 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
491 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
492 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
493 */
494__kernel void depthwise_convolution_3x3_stridex1_stridey1_f32(
495 TENSOR3D_DECLARATION(src),
496 TENSOR3D_DECLARATION(dst),
497 TENSOR3D_DECLARATION(weights)
498#if defined(HAS_BIAS)
499 ,
500 VECTOR_DECLARATION(biases)
501#endif //defined(HAS_BIAS)
502)
503{
504 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
505 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
506
507 float2 pixels0 = 0.0f;
508 float2 pixels1 = 0.0f;
509 float2 pixels2 = 0.0f;
510 float2 pixels3 = 0.0f;
511
512 // Extract channel and linearized batch indices
513 const int channel = get_global_id(2) % DST_CHANNELS;
514 const int batch = get_global_id(2) / DST_CHANNELS;
515 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
516 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
517 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
518 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
519
520#if(DILATION_X == 1 && DILATION_Y == 1)
521 // Load the weights
522 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
523 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
524 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
525
526 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
527 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
528 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
529 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
530 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
531 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
532 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
533
534 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src00, weights_row0);
535 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src10, weights_row1);
536 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src20, weights_row2);
537 CONVOLUTION1x3_2X1_STRIDE1(pixels1, src10, weights_row0);
538 CONVOLUTION1x3_2X1_STRIDE1(pixels1, src20, weights_row1);
539 CONVOLUTION1x3_2X1_STRIDE1(pixels1, src30, weights_row2);
540 CONVOLUTION1x3_2X1_STRIDE1(pixels2, src20, weights_row0);
541 CONVOLUTION1x3_2X1_STRIDE1(pixels2, src30, weights_row1);
542 CONVOLUTION1x3_2X1_STRIDE1(pixels2, src40, weights_row2);
543 CONVOLUTION1x3_2X1_STRIDE1(pixels3, src30, weights_row0);
544 CONVOLUTION1x3_2X1_STRIDE1(pixels3, src40, weights_row1);
545 CONVOLUTION1x3_2X1_STRIDE1(pixels3, src50, weights_row2);
546
547#else /* DILATION_X==1 && DILATION_Y==1 */
548
549 //3x3 Convolution of elements starting in 0th row
550 pixels0 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
551 //3x3 Convolution of elements starting in 1st row
552 pixels1 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 1, weights_addr, weights_stride_y);
553 //3x3 Convolution of elements starting in 2nd row
554 pixels2 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
555 //3x3 Convolution of elements starting in 3rd row
556 pixels3 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 3, weights_addr, weights_stride_y);
557
558#endif /* DILATION_X==1 && DILATION_Y==1 */
559
560#ifdef HAS_BIAS
561 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
562
563 float bias = *((__global float *)(vector_offset(&biases, channel)));
564
565 pixels0 += (float2)bias;
566 pixels1 += (float2)bias;
567 pixels2 += (float2)bias;
568 pixels3 += (float2)bias;
569#endif /* defined(HAS_BIAS) */
570
571 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
572 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
573 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
574 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
575}
576
577/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
578 * stride_x and stride_y are equal to 2
579 *
580 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
581 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
582 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
583 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
584 *
585 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
586 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
587 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
588 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
589 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
590 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
591 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
592 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
593 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
594 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
595 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
596 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
597 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
598 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
599 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
600 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
601 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
602 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
603 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
604 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
605 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
606 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
607 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
608 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
609 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
610 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
611 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
612 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
613 */
614__kernel void depthwise_convolution_3x3_stridex2_stridey2_f32(
615 TENSOR3D_DECLARATION(src),
616 TENSOR3D_DECLARATION(dst),
617 TENSOR3D_DECLARATION(weights)
618#if defined(HAS_BIAS)
619 ,
620 VECTOR_DECLARATION(biases)
621#endif //defined(HAS_BIAS)
622)
623{
624 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
625 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
626
627 float2 pixels0 = 0.0f;
628 float2 pixels1 = 0.0f;
629
630 // Extract channel and linearized batch indices
631 const int channel = get_global_id(2) % DST_CHANNELS;
632 const int batch = get_global_id(2) / DST_CHANNELS;
633 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
634 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
635 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
636 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
637
638#if(DILATION_X == 1 && DILATION_Y == 1)
639
640 // Load the weights
641 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
642 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
643 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
644
645 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
646 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
647 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
648 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
649 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
650 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
651 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
652 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
653 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
654 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
655 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
656
657 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src00, src01, weights_row0);
658 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src10, src11, weights_row1);
659 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src20, src21, weights_row2);
660 CONVOLUTION1x3_2X1_STRIDE2(pixels1, src20, src21, weights_row0);
661 CONVOLUTION1x3_2X1_STRIDE2(pixels1, src30, src31, weights_row1);
662 CONVOLUTION1x3_2X1_STRIDE2(pixels1, src40, src41, weights_row2);
663
664#else /* DILATION_X==1 && DILATION_Y==1 */
665
666 //3x3 Convolution of elements starting in 0th row
667 pixels0 = convolution_3x3_dilation_stridex2_stridey2_f32(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
668 //3x3 Convolution of elements starting in 2nd row
669 pixels1 = convolution_3x3_dilation_stridex2_stridey2_f32(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
670#endif /* DILATION_X==1 && DILATION_Y==1 */
671
672#ifdef HAS_BIAS
673 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
674
675 float bias = *((__global float *)(vector_offset(&biases, channel)));
676
677 pixels0 += (float2)bias;
678 pixels1 += (float2)bias;
679#endif /* defined(HAS_BIAS) */
680
681 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
682 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
683}
684
685#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
686
687#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
688#if defined(CONV_STRIDE_X)
689#if CONV_STRIDE_X == 1
690#define convolution1x3_f16 convolution1x3_stride_1_f16
691#elif CONV_STRIDE_X == 2
692#define convolution1x3_f16 convolution1x3_stride_2_f16
693#elif CONV_STRIDE_X == 3
694#define convolution1x3_f16 convolution1x3_stride_3_f16
695#else /* CONV_STRIDE_X */
696#error "Stride not supported"
697#endif /* CONV_STRIDE_X */
698
699#if(DILATION_X > 1 || DILATION_Y > 1)
700
701/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16
702 *
703 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
704 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
705 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
706 * @param[in] y_offset Offset from the source tensor from which to start convolution
707 * @param[in] weights_addr Pointer from where to get weights
708 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
709 */
710inline half4 convolution_3x3_dilation_stridex1_stridey1_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
711 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
712{
713 // Load the weights
714 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
715 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
716 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
717
718 half4 pixels0 = 0.0f;
719
720 half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
721 half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
722 half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
723
724 half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
725 half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
726 half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
727
728 half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
729 half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
730 half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
731
732 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
733 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
734 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
735
736 return pixels0;
737}
738
739/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
740 *
741 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
742 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
743 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
744 * @param[in] y_offset Offset from the source tensor from which to start convolution
745 * @param[in] weights_addr Pointer from where to get weights
746 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
747 */
748inline half4 convolution_3x3_dilation_stridex2_stridey2_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
749 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
750{
751 // Load the weights
752 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
753 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
754 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
755
756 half4 pixels0 = 0.0f;
757
758 half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
759 half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
760 half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
761
762 half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
763 half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
764 half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
765
766 half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
767 half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
768 half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
769
770 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
771 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
772 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
773
774 return pixels0;
775}
776
777#endif // (DILATION_X > 1 && DILATION_Y > 1)
778
779/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
780 *
781 * @param[in] left_pixel Pointer to the left pixel.
782 * @param[in] left_coeff Weight of the left pixel
783 * @param[in] middle_coeff Weight of the middle pixel
784 * @param[in] right_coeff Weight of the right pixel
785 *
786 * @return a half4 containing 4 convoluted values.
787 */
788inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
789 const half left_coeff,
790 const half middle_coeff,
791 const half right_coeff)
792{
793#if(DILATION_X == 1 && DILATION_Y == 1)
794
795 half8 temp = vload8(0, (__global half *)left_pixel);
796
797 half4 left = CONVERT(temp.s0123, half4);
798 half4 middle = CONVERT(temp.s1234, half4);
799 half4 right = CONVERT(temp.s2345, half4);
800
801 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
802#else /* DILATION_X==1 && DILATION_Y==1 */
803 return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
804 + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
805 + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
806
807#endif /* DILATION_X==1 && DILATION_Y==1 */
808}
809
810/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
811 *
812 * @param[in] left_pixel Pointer to the left pixel.
813 * @param[in] left_coeff Weight of the left pixel
814 * @param[in] middle_coeff Weight of the middle pixel
815 * @param[in] right_coeff Weight of the right pixel
816 *
817 * @return a half4 containing 4 convoluted values.
818 */
819inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
820 const half left_coeff,
821 const half middle_coeff,
822 const half right_coeff)
823{
824#if(DILATION_X == 1 && DILATION_Y == 1)
825
826 half8 temp0 = vload8(0, (__global half *)left_pixel);
827 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
828
829 half4 left = CONVERT(temp0.s0246, half4);
830 half4 middle = CONVERT(temp0.s1357, half4);
831 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
832
833 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
834#else /* DILATION_X==1 && DILATION_Y==1 */
835
836 __global half *left_pixel_float = (__global half *)left_pixel;
837
838 return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
839 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
840 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
841
842#endif /* DILATION_X==1 && DILATION_Y==1 */
843}
844
845/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
846 *
847 * @param[in] left_pixel Pointer to the left pixel.
848 * @param[in] left_coeff Weight of the left pixel
849 * @param[in] middle_coeff Weight of the middle pixel
850 * @param[in] right_coeff Weight of the right pixel
851 *
852 * @return a half4 containing 4 convoluted values.
853 */
854inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
855 const half left_coeff,
856 const half middle_coeff,
857 const half right_coeff)
858{
859#if(DILATION_X == 1 && DILATION_Y == 1)
860
861 half16 temp0 = vload16(0, (__global half *)left_pixel);
862
863 half4 left = CONVERT(temp0.s0369, half4);
864 half4 middle = CONVERT(temp0.s147A, half4);
865 half4 right = CONVERT(temp0.s258B, half4);
866
867 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
868#else /* DILATION_X==1 && DILATION_Y==1 */
869
870 __global half *left_pixel_float = (__global half *)left_pixel;
871
872 return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
873 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
874 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
875
876#endif /* DILATION_X==1 && DILATION_Y==1 */
877}
878
879/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
880 *
881 * Convolution matrix layout:
882 *
883 * [ mat0, mat1, mat2 ]\n
884 * [ mat3, mat4, mat5 ]\n
885 * [ mat6, mat7, mat8 ]\n
886 *
887 * @param[in] src A pointer to source Image structure
888 * @param[in] mat0 Coefficient from the convolution matrix
889 * @param[in] mat1 Coefficient from the convolution matrix
890 * @param[in] mat2 Coefficient from the convolution matrix
891 * @param[in] mat3 Coefficient from the convolution matrix
892 * @param[in] mat4 Coefficient from the convolution matrix
893 * @param[in] mat5 Coefficient from the convolution matrix
894 * @param[in] mat6 Coefficient from the convolution matrix
895 * @param[in] mat0 Coefficient from the convolution matrix
896 * @param[in] mat7 Coefficient from the convolution matrix
897 * @param[in] mat8 Coefficient from the convolution matrix
898 *
899 * @return a half4 containing 4 convoluted values.
900 */
901inline half4 convolution3x3_f16(
902 __global uchar *src, uint src_stride_y,
903 const half mat0, const half mat1, const half mat2,
904 const half mat3, const half mat4, const half mat5,
905 const half mat6, const half mat7, const half mat8)
906{
907 half4 pixels;
908
909 pixels = convolution1x3_f16(src, mat0, mat1, mat2);
910 pixels += convolution1x3_f16(src + DILATION_Y * src_stride_y, mat3, mat4, mat5);
911 pixels += convolution1x3_f16(src + DILATION_Y * 2 * src_stride_y, mat6, mat7, mat8);
912
913 return pixels;
914}
915
916#if defined(DEPTH_MULTIPLIER)
917
918/** This OpenCL kernel computes the depthwise convolution 3x3
919 *
920 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
921 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
922 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
923 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
924 *
925 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
926 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
927 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
928 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
929 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
930 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
931 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
932 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
933 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
934 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
935 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
936 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
937 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
938 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
939 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
940 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
941 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
942 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
943 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
944 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
945 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
946 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
947 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
948 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
949 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
950 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
951 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
952 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
953 */
954__kernel void depthwise_convolution_3x3_f16(
955 TENSOR3D_DECLARATION(src),
956 TENSOR3D_DECLARATION(dst),
957 TENSOR3D_DECLARATION(weights)
958#if defined(HAS_BIAS)
959 ,
960 VECTOR_DECLARATION(biases)
961#endif //defined(HAS_BIAS)
962)
963{
964 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
965 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
966#if defined(HAS_BIAS)
967 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
968#endif //defined(HAS_BIAS)
969
970 // Extract channel and linearized batch indices
971 const int channel = get_global_id(2) % DST_CHANNELS;
972 const int batch = get_global_id(2) / DST_CHANNELS;
973 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
974 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
975 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
976 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
977
978 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
979 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
980 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
981 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
982
983 half4 pixels = convolution3x3_f16(src_addr, src_stride_y, weights_values0.s0, weights_values0.s1, weights_values0.s2,
984 weights_values1.s0, weights_values1.s1, weights_values1.s2,
985 weights_values2.s0, weights_values2.s1, weights_values2.s2);
986#if defined(HAS_BIAS)
987 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
988#endif //defined(HAS_BIAS)
989
990 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);
991}
992#endif // defined(DEPTH_MULTIPLIER)
993#endif // defined(CONV_STRIDE_X)
994
995/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
996 * when both stride_x and stride_y are equal to 1
997 *
998 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
999 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1000 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1001 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
1002 *
1003 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1004 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1005 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1006 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1007 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1008 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1009 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1010 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1011 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1012 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1013 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1014 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1015 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1016 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1017 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1018 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1019 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1020 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1021 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1022 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1023 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1024 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1025 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1026 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1027 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1028 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1029 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1030 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1031 */
1032__kernel void depthwise_convolution_3x3_stridex1_stridey1_f16(
1033 TENSOR3D_DECLARATION(src),
1034 TENSOR3D_DECLARATION(dst),
1035 TENSOR3D_DECLARATION(weights)
1036#if defined(HAS_BIAS)
1037 ,
1038 VECTOR_DECLARATION(biases)
1039#endif //defined(HAS_BIAS)
1040)
1041{
1042 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
1043 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1044
1045 // Extract channel and linearized batch indices
1046 const int channel = get_global_id(2) % DST_CHANNELS;
1047 const int batch = get_global_id(2) / DST_CHANNELS;
1048
1049#ifdef HAS_BIAS
1050 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1051
1052 half bias = *((__global half *)(vector_offset(&biases, channel)));
1053#endif /* defined(HAS_BIAS) */
1054
1055 half4 pixels0 = 0.0f;
1056 half4 pixels1 = 0.0f;
1057 half4 pixels2 = 0.0f;
1058 half4 pixels3 = 0.0f;
1059
1060 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1061 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1062 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
1063 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
1064
1065#if(DILATION_X == 1 && DILATION_Y == 1)
1066 // Load the weights
1067 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1068 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1069 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1070
1071 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
1072 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1073 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1074 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1075 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1076 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1077 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
1078
1079 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src00, weights_row0);
1080 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src10, weights_row1);
1081 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src20, weights_row2);
1082 CONVOLUTION1x3_4X1_STRIDE1(pixels1, src10, weights_row0);
1083 CONVOLUTION1x3_4X1_STRIDE1(pixels1, src20, weights_row1);
1084 CONVOLUTION1x3_4X1_STRIDE1(pixels1, src30, weights_row2);
1085 CONVOLUTION1x3_4X1_STRIDE1(pixels2, src20, weights_row0);
1086 CONVOLUTION1x3_4X1_STRIDE1(pixels2, src30, weights_row1);
1087 CONVOLUTION1x3_4X1_STRIDE1(pixels2, src40, weights_row2);
1088 CONVOLUTION1x3_4X1_STRIDE1(pixels3, src30, weights_row0);
1089 CONVOLUTION1x3_4X1_STRIDE1(pixels3, src40, weights_row1);
1090 CONVOLUTION1x3_4X1_STRIDE1(pixels3, src50, weights_row2);
1091
1092#else /* DILATION_X==1 && DILATION_Y==1 */
1093
1094 //3x3 Convolution of elements starting in 0th row
1095 pixels0 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
1096 //3x3 Convolution of elements starting in 1st row
1097 pixels1 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 1, weights_addr, weights_stride_y);
1098 //3x3 Convolution of elements starting in 2nd row
1099 pixels2 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
1100 //3x3 Convolution of elements starting in 3rd row
1101 pixels3 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 3, weights_addr, weights_stride_y);
1102
1103#endif /* DILATION_X==1 && DILATION_Y==1 */
1104
1105#ifdef HAS_BIAS
1106 pixels0 += (half4)bias;
1107 pixels1 += (half4)bias;
1108 pixels2 += (half4)bias;
1109 pixels3 += (half4)bias;
1110#endif /* defined(HAS_BIAS) */
1111
1112 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1113 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1114 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
1115 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
1116}
1117
1118/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
1119 * when both stride_x and stride_y are equal to 2
1120 *
1121 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
1122 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1123 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1124 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
1125 *
1126 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1127 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1128 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1129 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1130 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1131 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1132 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1133 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1134 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1135 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1136 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1137 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1138 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1139 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1140 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1141 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1142 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1143 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1144 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1145 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1146 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1147 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1148 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1149 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1150 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1151 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1152 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1153 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1154 */
1155__kernel void depthwise_convolution_3x3_stridex2_stridey2_f16(
1156 TENSOR3D_DECLARATION(src),
1157 TENSOR3D_DECLARATION(dst),
1158 TENSOR3D_DECLARATION(weights)
1159#if defined(HAS_BIAS)
1160 ,
1161 VECTOR_DECLARATION(biases)
1162#endif //defined(HAS_BIAS)
1163)
1164{
1165 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
1166 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1167
1168 // Extract channel and linearized batch indices
1169 const int channel = get_global_id(2) % DST_CHANNELS;
1170 const int batch = get_global_id(2) / DST_CHANNELS;
1171
1172#ifdef HAS_BIAS
1173 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1174
1175 half bias = *((__global half *)(vector_offset(&biases, channel)));
1176#endif /* defined(HAS_BIAS) */
1177
1178 half4 pixels0 = 0.0f;
1179 half4 pixels1 = 0.0f;
1180
1181 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1182 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1183 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
1184 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
1185
1186#if(DILATION_X == 1 && DILATION_Y == 1)
1187
1188 // Load the weights
1189 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1190 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1191 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1192
1193 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
1194 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1195 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1196 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1197 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1198 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1199 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1200 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1201 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1202 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1203 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1204
1205 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1206 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1207 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1208 CONVOLUTION1x3_4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1209 CONVOLUTION1x3_4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1210 CONVOLUTION1x3_4X1_STRIDE2(pixels1, src40, src41, weights_row2);
1211
1212#else /* DILATION_X==1 && DILATION_Y==1 */
1213 //3x3 Convolution of elements starting in 0th row
1214 pixels0 = convolution_3x3_dilation_stridex2_stridey2_f16(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
1215 //3x3 Convolution of elements starting in 2nd row
1216 pixels1 = convolution_3x3_dilation_stridex2_stridey2_f16(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
1217#endif /* DILATION_X==1 && DILATION_Y==1 */
1218
1219#ifdef HAS_BIAS
1220 pixels0 += (half4)bias;
1221 pixels1 += (half4)bias;
1222#endif /* defined(HAS_BIAS) */
1223
1224 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1225 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1226}
1227#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
1228
1229#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)
1230/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
1231 *
1232 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
1233 * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
1234 * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
1235 * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
1236 * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
1237 * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
1238 * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
1239 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1240 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1241 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1242 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
1243 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
1244 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
1245 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1246 *
1247 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
1248 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1249 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1250 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1251 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1252 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1253 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1254 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1255 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1256 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1257 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1258 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1259 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1260 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1261 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1262 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1263 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1264 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1265 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1266 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1267 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
1268 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1269 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1270 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1271 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1272 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1273 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1274 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1275 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1276 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1277 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1278 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1279 */
1280__kernel void dwc_MxN_native_fp_nhwc(
1281 TENSOR4D_DECLARATION(src),
1282 TENSOR4D_DECLARATION(dst),
1283 TENSOR3D_DECLARATION(weights)
1284#if defined(HAS_BIAS)
1285 ,
1286 VECTOR_DECLARATION(biases)
1287#endif // defined(HAS_BIAS)
1288)
1289{
1290 int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0) * sizeof(DATA_TYPE);
1291
1292 int x = get_global_id(0); // channels
1293 int y = get_global_id(1); // spatial coordinate x
1294#if defined(DST_DEPTH)
1295 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1296 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1297#else // defined(DST_DEPTH)
1298 int z = get_global_id(2); // spatial coordinate y
1299#endif // defined(DST_DEPTH)
1300
1301 __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs;
1302
1303 __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
1304
1305 __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
1306
1307#if defined(HAS_BIAS)
1308 __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
1309#endif // defined(HAS_BIAS)
1310
1311#if defined(DST_DEPTH)
1312 s_addr += b * src_stride_w;
1313 d_addr += b * dst_stride_w;
1314#endif // defined(DST_DEPTH)
1315
1316 for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
1317 {
1318 // Each work-item computes N0x1x1 elements
1319 VEC_DATA_TYPE(DATA_TYPE, N0)
1320 res0 = 0;
1321
1322 int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
1323 int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1324
1325 for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
1326 {
1327 if(y_coord >= 0 && y_coord < SRC_DIM2)
1328 {
1329 int x_coord_tmp = x_coord;
1330
1331 for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
1332 {
1333 if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
1334 {
1335 int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
1336 int w_offset = xk * weights_stride_y + yk * weights_stride_z;
1337
1338 // Load input and weights values
1339 VEC_DATA_TYPE(DATA_TYPE, N0)
1340 i = VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset));
1341 VEC_DATA_TYPE(DATA_TYPE, N0)
1342 w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset));
1343
1344#if GPU_ARCH == GPU_ARCH_MIDGARD
1345 res0 += i * w;
1346#else // GPU_ARCH == GPU_ARCH_MIDGARD
1347 res0 = fma(i, w, res0);
1348#endif // GPU_ARCH == GPU_ARCH_MIDGARD
1349 }
1350 x_coord_tmp += DILATION_X;
1351 }
1352 }
1353 y_coord += DILATION_Y;
1354 }
1355
1356#if defined(HAS_BIAS)
1357 res0 += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));
1358#endif // defined(HAS_BIAS)
1359
1360 res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, res0, A_VAL, B_VAL);
1361
1362 STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
1363
1364 w_addr += sizeof(DATA_TYPE);
1365 d_addr += sizeof(DATA_TYPE);
1366#if defined(HAS_BIAS)
1367 b_addr += sizeof(DATA_TYPE);
1368#endif // defined(HAS_BIAS)
1369 }
1370}
1371#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)
1372
1373#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
1374
1375#if DATA_TYPE != float || DATA_TYPE != half
1376#error "Unsupported data type"
1377#endif // DATA_TYPE != float || DATA_TYPE != half
1378
1379#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
1380
1381#define FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond) \
1382 ({ \
1383 basename##0 = select(basename##0, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s0)); \
1384 basename##1 = select(basename##1, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s1)); \
1385 basename##2 = select(basename##2, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s2)); \
1386 })
1387
1388#define FILL_ZERO_OUT_OF_BOUND_4(data_type, vec_size, basename, cond) \
1389 ({ \
1390 FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond); \
1391 basename##3 = select(basename##3, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s3)); \
1392 })
1393
1394#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1395
1396/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1397 *
1398 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
1399 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1400 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1401 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1402 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1403 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1404 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
1405 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
1406 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
1407 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1408 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
1409 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
1410 * @note In case of biases, -DHAS_BIAS must to be passed at compile
1411 * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)
1412 *
1413 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
1414 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1415 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1416 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1417 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1418 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1419 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1420 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1421 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1422 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1423 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1424 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1425 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1426 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1427 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1428 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1429 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1430 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1431 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1432 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1433 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
1434 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1435 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1436 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1437 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1438 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1439 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1440 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1441 * @param[in] max_offset Max offset for the input tensor
1442 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1443 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1444 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1445 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1446 */
1447__kernel void depthwise_convolution_3x3_nhwc(
1448 TENSOR4D_DECLARATION(src),
1449 TENSOR4D_DECLARATION(dst),
1450 TENSOR3D_DECLARATION(weights)
1451#if defined(HAS_BIAS)
1452 ,
1453 VECTOR_DECLARATION(biases)
1454#endif /* defined(HAS_BIAS) */
1455)
1456{
1457 int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
1458 int y = get_global_id(1); // spatial coordinate x
1459#if defined(DST_DEPTH)
1460 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1461 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1462#else // defined(DST_DEPTH)
1463 int z = get_global_id(2); // spatial coordinate y
1464#endif // defined(DST_DEPTH)
1465
1466 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
1467
1468#if defined(DST_DEPTH)
1469 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
1470#else /* defined(DST_DEPTH) */
1471 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
1472#endif /* defined(DST_DEPTH) */
1473
1474 int3 src_coord_y = (int3)(y * CONV_STRIDE_X - CONV_PAD_LEFT) + (int3)(0, DILATION_X, 2 * DILATION_X);
1475 int3 src_coord_z = (int3)(z * CONV_STRIDE_Y - CONV_PAD_TOP) + (int3)(0, DILATION_Y, 2 * DILATION_Y);
1476
1477 int3 src_offset_y = clamp(src_coord_y, (int3)0, (int3)(SRC_DIM_1 - 1));
1478 int3 src_offset_z = clamp(src_coord_z, (int3)0, (int3)(SRC_DIM_2 - 1));
1479
1480 // Use these vectors to check whether the unclamped load would have been out of bounds
1481 src_coord_y = (src_offset_y != src_coord_y);
1482 src_coord_z = (src_offset_z != src_coord_z);
1483
1484 src_offset_y *= (int3)src_stride_y;
1485 src_offset_z *= (int3)src_stride_z;
1486
1487 // We compute VEC_SIZEx1x1 [C,W,H] elements
1488 VEC_FLOAT acc0 = 0;
1489
1490 // Load weights
1491 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));
1492 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));
1493 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));
1494 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));
1495 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));
1496 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));
1497 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));
1498 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));
1499 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));
1500
1501 // Load input values
1502 // z == 0
1503 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));
1504 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));
1505 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));
1506
1507 FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s0);
1508
1509 acc0 = fma(values0, w0, acc0);
1510 acc0 = fma(values1, w1, acc0);
1511 acc0 = fma(values2, w2, acc0);
1512
1513 // z == 1
1514 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
1515 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
1516 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
1517
1518 FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s1);
1519
1520 acc0 = fma(values0, w3, acc0);
1521 acc0 = fma(values1, w4, acc0);
1522 acc0 = fma(values2, w5, acc0);
1523
1524 // z == 2
1525 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
1526 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
1527 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
1528
1529 FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s2);
1530
1531 acc0 = fma(values0, w6, acc0);
1532 acc0 = fma(values1, w7, acc0);
1533 acc0 = fma(values2, w8, acc0);
1534
1535#if defined(HAS_BIAS)
1536 __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
1537 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);
1538 acc0 += bias_values;
1539#endif // defined(HAS_BIAS)
1540
1541#if defined(DST_DEPTH)
1542 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1543#else /* defined(DST_DEPTH) */
1544 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z;
1545#endif /* defined(DST_DEPTH) */
1546
1547 acc0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);
1548 STORE_VECTOR_SELECT(acc, DATA_TYPE, dst_addr, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
1549}
1550#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1551
1552#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1553/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1554 *
1555 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
1556 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1557 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1558 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1559 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1560 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1561 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1562 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
1563 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1564 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
1565 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1566 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
1567 * @note The size of the output's second dimension must be passed at compile time using -DDST_DIM_1 (e.g. -DDST_DIM_1=64)
1568 * @note The size of the output's third dimension must be passed at compile time using -DDST_DIM_2 (e.g. -DDST_DIM_2=32)
1569 * @note In case of biases, -DHAS_BIAS must to be passed at compile
1570 * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)
1571 *
1572 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
1573 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1574 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1575 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1576 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1577 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1578 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1579 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1580 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1581 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1582 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1583 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1584 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1585 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1586 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1587 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1588 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1589 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1590 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1591 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1592 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
1593 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1594 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1595 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1596 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1597 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1598 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1599 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1600 * @param[in] max_offset Max offset for the input tensor
1601 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1602 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1603 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1604 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1605 */
1606__kernel void depthwise_convolution_3x3_nhwc_stride1(
1607 TENSOR4D_DECLARATION(src),
1608 TENSOR4D_DECLARATION(dst),
1609 TENSOR3D_DECLARATION(weights)
1610#if defined(HAS_BIAS)
1611 ,
1612 VECTOR_DECLARATION(biases)
1613#endif /* defined(HAS_BIAS) */
1614)
1615{
1616 int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
1617 int y = get_global_id(1); // spatial coordinate x
1618#if defined(DST_DEPTH)
1619 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1620 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1621#else // defined(DST_DEPTH)
1622 int z = get_global_id(2); // spatial coordinate y
1623#endif // defined(DST_DEPTH)
1624
1625 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
1626
1627#if defined(DST_DEPTH)
1628 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
1629#else /* defined(DST_DEPTH) */
1630 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
1631#endif /* defined(DST_DEPTH) */
1632
1633 int4 src_coord_y = (int4)(y * NUM_ROWS_PROCESSED - CONV_PAD_LEFT) + V_OFFS4(int);
1634 int4 src_coord_z = (int4)(z * NUM_PLANES_PROCESSED - CONV_PAD_TOP) + V_OFFS4(int);
1635
1636 int4 src_offset_y = clamp(src_coord_y, (int4)0, (int4)(SRC_DIM_1 - 1));
1637 int4 src_offset_z = clamp(src_coord_z, (int4)0, (int4)(SRC_DIM_2 - 1));
1638
1639 // Use these vectors to check whether the unclamped load would have been out of bounds
1640 src_coord_y = (src_offset_y != src_coord_y);
1641 src_coord_z = (src_offset_z != src_coord_z);
1642
1643 src_offset_y *= (int4)src_stride_y;
1644 src_offset_z *= (int4)src_stride_z;
1645
1646 // We compute VEC_SIZEx2x2 [C,W,H] elements
1647 VEC_FLOAT acc0 = 0;
1648 VEC_FLOAT acc1 = 0;
1649 VEC_FLOAT acc2 = 0;
1650 VEC_FLOAT acc3 = 0;
1651
1652 // Load weights
1653 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));
1654 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));
1655 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));
1656 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));
1657 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));
1658 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));
1659 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));
1660 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));
1661 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));
1662
1663 // Load input values
1664 // z == 0
1665 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));
1666 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));
1667 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));
1668 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s3));
1669
1670 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s0);
1671
1672 acc0 = fma(values0, w0, acc0);
1673 acc0 = fma(values1, w1, acc0);
1674 acc0 = fma(values2, w2, acc0);
1675 acc1 = fma(values1, w0, acc1);
1676 acc1 = fma(values2, w1, acc1);
1677 acc1 = fma(values3, w2, acc1);
1678
1679 // z == 1
1680 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
1681 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
1682 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
1683 values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s3));
1684
1685 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s1);
1686
1687 acc0 = fma(values0, w3, acc0);
1688 acc0 = fma(values1, w4, acc0);
1689 acc0 = fma(values2, w5, acc0);
1690 acc1 = fma(values1, w3, acc1);
1691 acc1 = fma(values2, w4, acc1);
1692 acc1 = fma(values3, w5, acc1);
1693
1694 acc2 = fma(values0, w0, acc2);
1695 acc2 = fma(values1, w1, acc2);
1696 acc2 = fma(values2, w2, acc2);
1697 acc3 = fma(values1, w0, acc3);
1698 acc3 = fma(values2, w1, acc3);
1699 acc3 = fma(values3, w2, acc3);
1700
1701 // z == 2
1702 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
1703 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
1704 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
1705 values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s3));
1706
1707 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s2);
1708
1709 acc0 = fma(values0, w6, acc0);
1710 acc0 = fma(values1, w7, acc0);
1711 acc0 = fma(values2, w8, acc0);
1712 acc1 = fma(values1, w6, acc1);
1713 acc1 = fma(values2, w7, acc1);
1714 acc1 = fma(values3, w8, acc1);
1715
1716 acc2 = fma(values0, w3, acc2);
1717 acc2 = fma(values1, w4, acc2);
1718 acc2 = fma(values2, w5, acc2);
1719 acc3 = fma(values1, w3, acc3);
1720 acc3 = fma(values2, w4, acc3);
1721 acc3 = fma(values3, w5, acc3);
1722
1723 // z == 3
1724 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s0));
1725 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s1));
1726 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s2));
1727 values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s3));
1728
1729 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s3);
1730
1731 acc2 = fma(values0, w6, acc2);
1732 acc2 = fma(values1, w7, acc2);
1733 acc2 = fma(values2, w8, acc2);
1734 acc3 = fma(values1, w6, acc3);
1735 acc3 = fma(values2, w7, acc3);
1736 acc3 = fma(values3, w8, acc3);
1737
1738#if defined(HAS_BIAS)
1739 __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
1740
1741 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);
1742
1743 acc0 += bias_values;
1744 acc1 += bias_values;
1745 acc2 += bias_values;
1746 acc3 += bias_values;
1747#endif // defined(HAS_BIAS)
1748
1749 int2 dst_offset_y = min((int2)(y * NUM_ROWS_PROCESSED) + V_OFFS2(int), (int2)(DST_DIM_1 - 1)) * (int2)dst_stride_y;
1750 int dst_coord_z = z * NUM_PLANES_PROCESSED;
1751
1752#if defined(DST_DEPTH)
1753 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z + b * dst_stride_w;
1754#else // defined(DST_DEPTH)
1755 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z;
1756#endif // defined(DST_DEPTH)
1757
1758 /* Store vectors in reverse order along the Y. The Y offsets are calculated so that they are forced to be in bound.
1759 * If only the first address is in bound, the Y offset of the second address will be brought back and there will be 2 writes in the same location for the same thread.
1760 * Since the last vector to be written is always the valid one for that location, it overwrites the wrong values.
1761 */
1762 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc1, A_VAL, B_VAL);
1763 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
1764
1765 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);
1766 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
1767
1768#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1769 if((dst_coord_z + 1) < DST_DIM_2)
1770#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1771 {
1772 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc3, A_VAL, B_VAL);
1773 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
1774
1775 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc2, A_VAL, B_VAL);
1776 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
1777 }
1778}
1779
1780#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1781#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)