blob: da22faabedf28796f944eb6182f2a74b3ec1ba71 [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
Michele Di Giorgiof6f78762020-07-06 11:27:21 +01002 * Copyright (c) 2017-2020 Arm Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Giorgio Arena93a690e2017-08-01 16:09:33 +010024#include "helpers.h"
25
Usama Arif6a98a6e2019-05-10 17:07:27 +010026#include "activation_float_helpers.h"
Manuel Bottinia788c2f2019-04-08 13:18:00 +010027
28/** Get the pointer position at a certain offset in x and y direction.
29 *
30 * @param[in] ptr Pointer to the starting position of the buffer
31 * @param[in] x Relative X position
32 * @param[in] y Relative Y position
33 * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
34 * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
35 *
36 * @return a uchar
37 */
38inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
39{
40 return ptr + x * stride_x + y * stride_y;
41}
42
43#if(DILATION_X == 1 && DILATION_Y == 1)
44
45#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
46 ({ \
47 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
48 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
49 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
50 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
51 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
52 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
53 })
54
55#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
56 ({ \
57 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
58 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
59 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
60 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
61 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
62 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
63 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
64 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
65 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
66 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
67 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
68 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
69 })
70
71#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
72 ({ \
73 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
74 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
75 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
76 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
77 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
78 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
79 })
80
81#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
82 ({ \
83 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
84 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
85 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
86 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
87 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
88 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
89 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
90 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
91 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
92 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
93 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
94 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
95 })
96
97#else /* DILATION_X==1 && DILATION_Y==1 */
98
99#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
100 ({ \
101 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
102 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
103 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
104 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
105 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
106 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
107 })
108
109#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
110 ({ \
111 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
112 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
113 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
114 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
115 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
116 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
117 })
118
119#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
120 ({ \
121 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
122 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
123 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
124 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
125 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
126 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
127 acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
128 acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
129 acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
130 acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
131 acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
132 acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
133 })
134
135#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
136 ({ \
137 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
138 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
139 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
140 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
141 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
142 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
143 acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
144 acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
145 acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
146 acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
147 acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
148 acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
149 })
150
151#endif /* DILATION_X==1 && DILATION_Y==1 */
152
153#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100154#if defined(CONV_STRIDE_X)
155
Giorgio Arena93a690e2017-08-01 16:09:33 +0100156#if CONV_STRIDE_X == 1
157#define convolution1x3 convolution1x3_stride_1
158#elif CONV_STRIDE_X == 2
159#define convolution1x3 convolution1x3_stride_2
160#elif CONV_STRIDE_X == 3
161#define convolution1x3 convolution1x3_stride_3
162#else /* CONV_STRIDE_X */
163#error "Stride not supported"
164#endif /* CONV_STRIDE_X */
165
166/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
167 *
168 * @param[in] left_pixel Pointer to the left pixel.
169 * @param[in] left_coeff Weight of the left pixel
170 * @param[in] middle_coeff Weight of the middle pixel
171 * @param[in] right_coeff Weight of the right pixel
172 *
173 * @return a float2 containing 2 convoluted values.
174 */
175inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
176 const float left_coeff,
177 const float middle_coeff,
178 const float right_coeff)
179{
Usama Arife73686a2019-04-08 17:30:48 +0100180#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100181 float4 temp = vload4(0, (__global float *)left_pixel);
182
183 float2 left = CONVERT(temp.s01, float2);
184 float2 middle = CONVERT(temp.s12, float2);
185 float2 right = CONVERT(temp.s23, float2);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100186 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100187#else /* DILATION_X==1 && DILATION_Y==1 */
188 return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
189 + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
190 + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
191#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100192}
193
194/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
195 *
196 * @param[in] left_pixel Pointer to the left pixel.
197 * @param[in] left_coeff Weight of the left pixel
198 * @param[in] middle_coeff Weight of the middle pixel
199 * @param[in] right_coeff Weight of the right pixel
200 *
201 * @return a float2 containing 2 convoluted values.
202 */
203inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
204 const float left_coeff,
205 const float middle_coeff,
206 const float right_coeff)
207{
Usama Arife73686a2019-04-08 17:30:48 +0100208#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100209 float4 temp0 = vload4(0, (__global float *)left_pixel);
210 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
211
212 float2 left = CONVERT(temp0.s02, float2);
213 float2 middle = CONVERT(temp0.s13, float2);
214 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
215
216 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100217#else /* DILATION_X==1 && DILATION_Y==1 */
218 __global float *left_pixel_float = (__global float *)left_pixel;
219
220 return vload4(0, left_pixel_float).s02 * (float2)left_coeff
221 + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
222 + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
223
224#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100225}
226
227/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
228 *
229 * @param[in] left_pixel Pointer to the left pixel.
230 * @param[in] left_coeff Weight of the left pixel
231 * @param[in] middle_coeff Weight of the middle pixel
232 * @param[in] right_coeff Weight of the right pixel
233 *
234 * @return a float2 containing 2 convoluted values.
235 */
236inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
237 const float left_coeff,
238 const float middle_coeff,
239 const float right_coeff)
240{
Usama Arife73686a2019-04-08 17:30:48 +0100241#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100242 float4 temp0 = vload4(0, (__global float *)left_pixel);
243 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
244
245 float2 left = CONVERT(temp0.s03, float2);
246 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
247 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
248
249 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100250#else /* DILATION_X==1 && DILATION_Y==1 */
251 __global float *left_pixel_float = (__global float *)left_pixel;
252
253 return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
254 + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
255 + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
256#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100257}
258
259/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
260 *
261 * Convolution matrix layout:
262 *
263 * [ mat0, mat1, mat2 ]\n
264 * [ mat3, mat4, mat5 ]\n
265 * [ mat6, mat7, mat8 ]\n
266 *
267 * @param[in] src A pointer to source Image structure
268 * @param[in] mat0 Coefficient from the convolution matrix
269 * @param[in] mat1 Coefficient from the convolution matrix
270 * @param[in] mat2 Coefficient from the convolution matrix
271 * @param[in] mat3 Coefficient from the convolution matrix
272 * @param[in] mat4 Coefficient from the convolution matrix
273 * @param[in] mat5 Coefficient from the convolution matrix
274 * @param[in] mat6 Coefficient from the convolution matrix
275 * @param[in] mat0 Coefficient from the convolution matrix
276 * @param[in] mat7 Coefficient from the convolution matrix
277 * @param[in] mat8 Coefficient from the convolution matrix
278 *
279 * @return a float2 containing 2 convoluted values.
280 */
281inline float2 convolution3x3(
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100282 __global const uchar *src,
283 unsigned int src_stride_y,
Giorgio Arena93a690e2017-08-01 16:09:33 +0100284 const float mat0, const float mat1, const float mat2,
285 const float mat3, const float mat4, const float mat5,
286 const float mat6, const float mat7, const float mat8)
287{
288 float2 pixels;
289
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100290 pixels = convolution1x3((src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
291 pixels += convolution1x3((src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
292 pixels += convolution1x3((src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100293
294 return pixels;
295}
296
Gian Marcoc799ed82018-02-01 16:57:48 +0000297/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000298 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100299 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
300 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
301 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000302 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
303 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000304 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000305 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000306 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000307 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000308 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
309 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000310 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000311 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
312 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
313 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
314 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
315 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
316 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
317 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000318 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000319 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
320 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
321 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
322 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
323 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
324 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
325 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
326 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
327 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
328 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
329 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
330 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100331__kernel void depthwise_convolution_3x3(
332 TENSOR3D_DECLARATION(src),
333 TENSOR3D_DECLARATION(dst),
334 TENSOR3D_DECLARATION(weights)
335#if defined(HAS_BIAS)
336 ,
337 VECTOR_DECLARATION(biases)
338#endif //defined(HAS_BIAS)
339)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100340{
341 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
342 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100343 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100344
345 float2 pixels = 0.0f;
Giorgio Arena93a690e2017-08-01 16:09:33 +0100346
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100347 // Extract channel and linearized batch indices
348 const int channel = get_global_id(2) % DST_CHANNELS;
349 const int batch = get_global_id(2) / DST_CHANNELS;
350 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100351
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100352 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100353
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100354 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Giorgio Arena93a690e2017-08-01 16:09:33 +0100355
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100356 // Load the weights
357 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
358 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
359 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
360
361 pixels = convolution3x3(src_addr, src_stride_y,
362 weights_values0.s0, weights_values0.s1, weights_values0.s2,
363 weights_values1.s0, weights_values1.s1, weights_values1.s2,
364 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100365#if defined(HAS_BIAS)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100366 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
367
368 float bias = *((__global float *)(vector_offset(&biases, channel)));
369
370 pixels += (float2)bias;
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100371#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100372
Giorgio Arenad056e572020-10-12 11:53:51 +0100373 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100374}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100375#endif //defined(CONV_STRIDE_X)
376
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100377#if(DILATION_X > 1 || DILATION_Y > 1)
Usama Arife73686a2019-04-08 17:30:48 +0100378
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100379/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100380 *
381 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
382 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
383 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
384 * @param[in] y_offset Offset from the source tensor from which to start convolution
385 * @param[in] weights_addr Pointer from where to get weights
386 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
387 */
388inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
389 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
390{
391 // Load the weights
392 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
393 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
394 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
395
396 float2 pixels0 = 0.0f;
397
398 float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
399 float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
400 float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
401
402 float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
403 float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
404 float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
405
406 float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
407 float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
408 float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
409
410 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
411 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
412 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
413
414 return pixels0;
415}
416
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100417/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100418 *
419 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
420 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
421 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
422 * @param[in] y_offset Offset from the source tensor from which to start convolution
423 * @param[in] weights_addr Pointer from where to get weights
424 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
425 */
426inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
427 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
428{
429 // Load the weights
430 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
431 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
432 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
433
434 float2 pixels0 = 0.0f;
435
436 float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
437 float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
438 float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
439
440 float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
441 float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
442 float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
443
444 float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
445 float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
446 float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
447
448 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
449 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
450 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
451
452 return pixels0;
453}
454
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100455#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
Usama Arife73686a2019-04-08 17:30:48 +0100456
Gian Marcoc799ed82018-02-01 16:57:48 +0000457/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
458 * stride_x and stride_y are equal to 1
459 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100460 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100461 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
462 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
463 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100464 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000465 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
466 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000467 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000468 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000469 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000470 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000471 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
472 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
473 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
474 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
475 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
476 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
477 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
478 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
479 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
480 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
481 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
482 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
483 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
484 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
485 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
486 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
487 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
488 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
489 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
490 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
491 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
492 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
493 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000494__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000495 TENSOR3D_DECLARATION(src),
496 TENSOR3D_DECLARATION(dst),
497 TENSOR3D_DECLARATION(weights)
498#if defined(HAS_BIAS)
499 ,
500 VECTOR_DECLARATION(biases)
501#endif //defined(HAS_BIAS)
502)
503{
504 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
505 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100506 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000507
508 float2 pixels0 = 0.0f;
509 float2 pixels1 = 0.0f;
510 float2 pixels2 = 0.0f;
511 float2 pixels3 = 0.0f;
512
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100513 // Extract channel and linearized batch indices
514 const int channel = get_global_id(2) % DST_CHANNELS;
515 const int batch = get_global_id(2) / DST_CHANNELS;
516 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
517 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
518 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000519
Usama Arife73686a2019-04-08 17:30:48 +0100520#if(DILATION_X == 1 && DILATION_Y == 1)
Gian Marcoc799ed82018-02-01 16:57:48 +0000521 // Load the weights
522 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
523 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
524 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
525
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000526 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000527 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
528 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
529 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
530 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000531 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
532 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000533
534 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
535 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
536 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
537 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
538 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
539 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
540 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
541 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
542 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
543 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
544 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
545 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
546
Usama Arife73686a2019-04-08 17:30:48 +0100547#else /* DILATION_X==1 && DILATION_Y==1 */
548
549 //3x3 Convolution of elements starting in 0th row
550 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
551 //3x3 Convolution of elements starting in 1st row
552 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
553 //3x3 Convolution of elements starting in 2nd row
554 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
555 //3x3 Convolution of elements starting in 3rd row
556 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
557
558#endif /* DILATION_X==1 && DILATION_Y==1 */
559
Gian Marcoc799ed82018-02-01 16:57:48 +0000560#ifdef HAS_BIAS
561 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
562
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100563 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000564
565 pixels0 += (float2)bias;
566 pixels1 += (float2)bias;
567 pixels2 += (float2)bias;
568 pixels3 += (float2)bias;
569#endif /* defined(HAS_BIAS) */
570
Giorgio Arenad056e572020-10-12 11:53:51 +0100571 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
572 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
573 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
574 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000575}
576
577/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
578 * stride_x and stride_y are equal to 2
579 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100580 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100581 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
582 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
583 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100584 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000585 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
586 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000587 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000588 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000589 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000590 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000591 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
592 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
593 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
594 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
595 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
596 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
597 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
598 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
599 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
600 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
601 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
602 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
603 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
604 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
605 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
606 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
607 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
608 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
609 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
610 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
611 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
612 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
613 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000614__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000615 TENSOR3D_DECLARATION(src),
616 TENSOR3D_DECLARATION(dst),
617 TENSOR3D_DECLARATION(weights)
618#if defined(HAS_BIAS)
619 ,
620 VECTOR_DECLARATION(biases)
621#endif //defined(HAS_BIAS)
622)
623{
624 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
625 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100626 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000627
628 float2 pixels0 = 0.0f;
629 float2 pixels1 = 0.0f;
630
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100631 // Extract channel and linearized batch indices
Georgios Pinitas37044642018-10-30 14:53:25 +0000632 const int channel = get_global_id(2) % DST_CHANNELS;
633 const int batch = get_global_id(2) / DST_CHANNELS;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100634 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
635 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
636 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000637
Usama Arife73686a2019-04-08 17:30:48 +0100638#if(DILATION_X == 1 && DILATION_Y == 1)
639
Gian Marcoc799ed82018-02-01 16:57:48 +0000640 // Load the weights
641 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
642 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
643 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
644
645 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
646 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
647 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
648 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
649 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
650 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
651 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
652 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
653 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
654 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
655 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
656
657 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
658 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
659 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
660 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
661 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
662 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
663
Usama Arife73686a2019-04-08 17:30:48 +0100664#else /* DILATION_X==1 && DILATION_Y==1 */
665
666 //3x3 Convolution of elements starting in 0th row
667 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
668 //3x3 Convolution of elements starting in 2nd row
669 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
670#endif /* DILATION_X==1 && DILATION_Y==1 */
671
Gian Marcoc799ed82018-02-01 16:57:48 +0000672#ifdef HAS_BIAS
673 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
674
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100675 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000676
677 pixels0 += (float2)bias;
678 pixels1 += (float2)bias;
679#endif /* defined(HAS_BIAS) */
680
Giorgio Arenad056e572020-10-12 11:53:51 +0100681 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
682 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000683}
684
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100685#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena76572242018-04-04 17:44:26 +0100686
giuros016d109962019-01-07 17:47:19 +0000687#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
688/** Reshape the weights for quantized depthwise convolution
689 *
690 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
691 * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
692 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
693 * @attention Input's height and width should be 3
694 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100695 * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
giuros016d109962019-01-07 17:47:19 +0000696 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
697 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
698 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
699 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
700 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
701 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
702 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
703 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
704 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
705 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
706 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
707 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
708 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
709 */
710__kernel void depthwise_convolution_reshape_weights(
711 TENSOR3D_DECLARATION(src),
712 IMAGE_DECLARATION(dst))
713{
714 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
715 const int x = get_global_id(0);
716
717 // Load 3x3xVEC_SIZE weights
718 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
719 w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
720 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
721 w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
722 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
723 w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
724 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
725 w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
726 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
727 w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
728 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
729 w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
730 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
731 w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
732 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
733 w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
734 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
735 w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
736
737 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
738
739#if defined(TRANSPOSE)
740#if VEC_SIZE != 4
741#error "VEC_SIZE not supported"
742#else // VEC_SIZE != 4
743 VSTORE(VEC_SIZE)
744 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
745 VSTORE(VEC_SIZE)
746 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
747 VSTORE(VEC_SIZE)
748 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
749 VSTORE(VEC_SIZE)
750 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
751 VSTORE(VEC_SIZE)
752 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
753 VSTORE(VEC_SIZE)
754 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
755 VSTORE(VEC_SIZE)
756 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
757 VSTORE(VEC_SIZE)
758 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
759 VSTORE(VEC_SIZE)
760 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
761#endif // VEC_SIZE != 4
762#else // !defined(TRANSPOSE)
763 VSTORE(VEC_SIZE)
764 (w0, 0, dst_addr + 0);
765 VSTORE(VEC_SIZE)
766 (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
767 VSTORE(VEC_SIZE)
768 (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
769 VSTORE(VEC_SIZE)
770 (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
771 VSTORE(VEC_SIZE)
772 (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
773 VSTORE(VEC_SIZE)
774 (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
775 VSTORE(VEC_SIZE)
776 (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
777 VSTORE(VEC_SIZE)
778 (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
779 VSTORE(VEC_SIZE)
780 (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
781#endif // defined(TRANSPOSE)
782}
783#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
784
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100785#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000786#if defined(CONV_STRIDE_X)
787#if CONV_STRIDE_X == 1
788#define convolution1x3_f16 convolution1x3_stride_1_f16
789#elif CONV_STRIDE_X == 2
790#define convolution1x3_f16 convolution1x3_stride_2_f16
791#elif CONV_STRIDE_X == 3
792#define convolution1x3_f16 convolution1x3_stride_3_f16
793#else /* CONV_STRIDE_X */
794#error "Stride not supported"
795#endif /* CONV_STRIDE_X */
796
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100797#if(DILATION_X > 1 || DILATION_Y > 1)
798
799/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16
800 *
801 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
802 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
803 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
804 * @param[in] y_offset Offset from the source tensor from which to start convolution
805 * @param[in] weights_addr Pointer from where to get weights
806 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
807 */
808inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
809 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
810{
811 // Load the weights
812 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
813 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
814 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
815
816 half4 pixels0 = 0.0f;
817
818 half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
819 half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
820 half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
821
822 half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
823 half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
824 half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
825
826 half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
827 half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
828 half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
829
830 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
831 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
832 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
833
834 return pixels0;
835}
836
837/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
838 *
839 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
840 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
841 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
842 * @param[in] y_offset Offset from the source tensor from which to start convolution
843 * @param[in] weights_addr Pointer from where to get weights
844 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
845 */
846inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
847 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
848{
849 // Load the weights
850 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
851 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
852 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
853
854 half4 pixels0 = 0.0f;
855
856 half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
857 half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
858 half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
859
860 half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
861 half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
862 half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
863
864 half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
865 half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
866 half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
867
868 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
869 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
870 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
871
872 return pixels0;
873}
874
875#endif // (DILATION_X > 1 && DILATION_Y > 1)
876
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000877/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
878 *
879 * @param[in] left_pixel Pointer to the left pixel.
880 * @param[in] left_coeff Weight of the left pixel
881 * @param[in] middle_coeff Weight of the middle pixel
882 * @param[in] right_coeff Weight of the right pixel
883 *
884 * @return a half4 containing 4 convoluted values.
885 */
886inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
887 const half left_coeff,
888 const half middle_coeff,
889 const half right_coeff)
890{
Usama Arife73686a2019-04-08 17:30:48 +0100891#if(DILATION_X == 1 && DILATION_Y == 1)
892
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000893 half8 temp = vload8(0, (__global half *)left_pixel);
894
895 half4 left = CONVERT(temp.s0123, half4);
896 half4 middle = CONVERT(temp.s1234, half4);
897 half4 right = CONVERT(temp.s2345, half4);
898
899 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100900#else /* DILATION_X==1 && DILATION_Y==1 */
901 return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
902 + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
903 + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
904
905#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000906}
907
908/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
909 *
910 * @param[in] left_pixel Pointer to the left pixel.
911 * @param[in] left_coeff Weight of the left pixel
912 * @param[in] middle_coeff Weight of the middle pixel
913 * @param[in] right_coeff Weight of the right pixel
914 *
915 * @return a half4 containing 4 convoluted values.
916 */
917inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
918 const half left_coeff,
919 const half middle_coeff,
920 const half right_coeff)
921{
Usama Arife73686a2019-04-08 17:30:48 +0100922#if(DILATION_X == 1 && DILATION_Y == 1)
923
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000924 half8 temp0 = vload8(0, (__global half *)left_pixel);
925 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
926
927 half4 left = CONVERT(temp0.s0246, half4);
928 half4 middle = CONVERT(temp0.s1357, half4);
929 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
930
931 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100932#else /* DILATION_X==1 && DILATION_Y==1 */
933
934 __global half *left_pixel_float = (__global half *)left_pixel;
935
936 return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
937 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
938 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
939
940#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000941}
942
943/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
944 *
945 * @param[in] left_pixel Pointer to the left pixel.
946 * @param[in] left_coeff Weight of the left pixel
947 * @param[in] middle_coeff Weight of the middle pixel
948 * @param[in] right_coeff Weight of the right pixel
949 *
950 * @return a half4 containing 4 convoluted values.
951 */
952inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
953 const half left_coeff,
954 const half middle_coeff,
955 const half right_coeff)
956{
Usama Arife73686a2019-04-08 17:30:48 +0100957#if(DILATION_X == 1 && DILATION_Y == 1)
958
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000959 half16 temp0 = vload16(0, (__global half *)left_pixel);
960
961 half4 left = CONVERT(temp0.s0369, half4);
962 half4 middle = CONVERT(temp0.s147A, half4);
963 half4 right = CONVERT(temp0.s258B, half4);
964
965 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100966#else /* DILATION_X==1 && DILATION_Y==1 */
967
968 __global half *left_pixel_float = (__global half *)left_pixel;
969
970 return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
971 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
972 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
973
974#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000975}
976
977/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
978 *
979 * Convolution matrix layout:
980 *
981 * [ mat0, mat1, mat2 ]\n
982 * [ mat3, mat4, mat5 ]\n
983 * [ mat6, mat7, mat8 ]\n
984 *
985 * @param[in] src A pointer to source Image structure
986 * @param[in] mat0 Coefficient from the convolution matrix
987 * @param[in] mat1 Coefficient from the convolution matrix
988 * @param[in] mat2 Coefficient from the convolution matrix
989 * @param[in] mat3 Coefficient from the convolution matrix
990 * @param[in] mat4 Coefficient from the convolution matrix
991 * @param[in] mat5 Coefficient from the convolution matrix
992 * @param[in] mat6 Coefficient from the convolution matrix
993 * @param[in] mat0 Coefficient from the convolution matrix
994 * @param[in] mat7 Coefficient from the convolution matrix
995 * @param[in] mat8 Coefficient from the convolution matrix
996 *
997 * @return a half4 containing 4 convoluted values.
998 */
999inline half4 convolution3x3_f16(
1000 Image *src,
1001 const half mat0, const half mat1, const half mat2,
1002 const half mat3, const half mat4, const half mat5,
1003 const half mat6, const half mat7, const half mat8)
1004{
1005 half4 pixels;
1006
1007 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
Usama Arife73686a2019-04-08 17:30:48 +01001008 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
1009 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001010
1011 return pixels;
1012}
1013
Giorgio Arena76572242018-04-04 17:44:26 +01001014#if defined(DEPTH_MULTIPLIER)
1015
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001016/** This OpenCL kernel computes the depthwise convolution 3x3
1017 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001018 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001019 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1020 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1021 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001022 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001023 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1024 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001025 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001026 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001027 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001028 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1029 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001030 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001031 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001032 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1033 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1034 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1035 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1036 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1037 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1038 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001039 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001040 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1041 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1042 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1043 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1044 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1045 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1046 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001047 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001048 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1049 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1050 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1051 */
1052__kernel void depthwise_convolution_3x3_f16(
1053 TENSOR3D_DECLARATION(src),
1054 TENSOR3D_DECLARATION(dst),
1055 TENSOR3D_DECLARATION(weights)
1056#if defined(HAS_BIAS)
1057 ,
1058 VECTOR_DECLARATION(biases)
1059#endif //defined(HAS_BIAS)
1060)
1061{
1062 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1063 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001064 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001065#if defined(HAS_BIAS)
1066 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1067#endif //defined(HAS_BIAS)
1068
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001069 // Extract channel and linearized batch indices
1070 const int channel = get_global_id(2) % DST_CHANNELS;
1071 const int batch = get_global_id(2) / DST_CHANNELS;
1072 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1073 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
1074 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +01001075
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001076 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001077 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
1078 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
1079 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001080
1081 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
1082 weights_values1.s0, weights_values1.s1, weights_values1.s2,
1083 weights_values2.s0, weights_values2.s1, weights_values2.s2);
1084#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001085 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001086#endif //defined(HAS_BIAS)
1087
Giorgio Arenad056e572020-10-12 11:53:51 +01001088 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001089}
Giorgio Arena76572242018-04-04 17:44:26 +01001090#endif // defined(DEPTH_MULTIPLIER)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001091#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001092
1093/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
1094 * when both stride_x and stride_y are equal to 1
1095 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001096 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001097 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1098 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1099 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001100 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001101 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1102 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001103 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001104 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001105 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001106 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1107 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001108 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001109 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1110 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1111 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1112 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1113 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1114 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1115 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1116 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1117 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1118 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1119 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1120 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1121 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1122 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1123 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1124 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1125 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1126 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1127 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1128 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1129 */
1130__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
1131 TENSOR3D_DECLARATION(src),
1132 TENSOR3D_DECLARATION(dst),
1133 TENSOR3D_DECLARATION(weights)
1134#if defined(HAS_BIAS)
1135 ,
1136 VECTOR_DECLARATION(biases)
1137#endif //defined(HAS_BIAS)
1138)
1139{
1140 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1141 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001142 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1143
1144 // Extract channel and linearized batch indices
1145 const int channel = get_global_id(2) % DST_CHANNELS;
1146 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001147
1148#ifdef HAS_BIAS
1149 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1150
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001151 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001152#endif /* defined(HAS_BIAS) */
1153
1154 half4 pixels0 = 0.0f;
1155 half4 pixels1 = 0.0f;
1156 half4 pixels2 = 0.0f;
1157 half4 pixels3 = 0.0f;
1158
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001159 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1160 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1161 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001162
Usama Arife73686a2019-04-08 17:30:48 +01001163#if(DILATION_X == 1 && DILATION_Y == 1)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001164 // Load the weights
1165 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1166 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1167 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1168
1169 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
1170 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1171 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1172 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1173 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1174 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1175 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
1176
1177 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
1178 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
1179 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
1180 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
1181 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
1182 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
1183 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
1184 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
1185 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
1186 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
1187 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
1188 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
1189
Usama Arife73686a2019-04-08 17:30:48 +01001190#else /* DILATION_X==1 && DILATION_Y==1 */
1191
1192 //3x3 Convolution of elements starting in 0th row
1193 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1194 //3x3 Convolution of elements starting in 1st row
1195 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
1196 //3x3 Convolution of elements starting in 2nd row
1197 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1198 //3x3 Convolution of elements starting in 3rd row
1199 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
1200
1201#endif /* DILATION_X==1 && DILATION_Y==1 */
1202
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001203#ifdef HAS_BIAS
1204 pixels0 += (half4)bias;
1205 pixels1 += (half4)bias;
1206 pixels2 += (half4)bias;
1207 pixels3 += (half4)bias;
1208#endif /* defined(HAS_BIAS) */
1209
Giorgio Arenad056e572020-10-12 11:53:51 +01001210 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1211 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1212 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
1213 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001214}
1215
1216/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
1217 * when both stride_x and stride_y are equal to 2
1218 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001219 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001220 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1221 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1222 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001223 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001224 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1225 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001226 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001227 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001228 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001229 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001230 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1231 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001232 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1233 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1234 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1235 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1236 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1237 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1238 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1239 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1240 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1241 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1242 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1243 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1244 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1245 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1246 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1247 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1248 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1249 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1250 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1251 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1252 */
1253__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
1254 TENSOR3D_DECLARATION(src),
1255 TENSOR3D_DECLARATION(dst),
1256 TENSOR3D_DECLARATION(weights)
1257#if defined(HAS_BIAS)
1258 ,
1259 VECTOR_DECLARATION(biases)
1260#endif //defined(HAS_BIAS)
1261)
1262{
1263 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1264 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001265 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1266
1267 // Extract channel and linearized batch indices
1268 const int channel = get_global_id(2) % DST_CHANNELS;
1269 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001270
1271#ifdef HAS_BIAS
1272 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1273
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001274 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001275#endif /* defined(HAS_BIAS) */
1276
1277 half4 pixels0 = 0.0f;
1278 half4 pixels1 = 0.0f;
1279
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001280 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1281 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1282 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001283
Usama Arife73686a2019-04-08 17:30:48 +01001284#if(DILATION_X == 1 && DILATION_Y == 1)
1285
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001286 // Load the weights
1287 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1288 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1289 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1290
1291 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
1292 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1293 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1294 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1295 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1296 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1297 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1298 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1299 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1300 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1301 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1302
1303 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1304 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1305 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1306 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1307 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1308 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
1309
Usama Arife73686a2019-04-08 17:30:48 +01001310#else /* DILATION_X==1 && DILATION_Y==1 */
1311 //3x3 Convolution of elements starting in 0th row
1312 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1313 //3x3 Convolution of elements starting in 2nd row
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001314 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +01001315#endif /* DILATION_X==1 && DILATION_Y==1 */
1316
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001317#ifdef HAS_BIAS
1318 pixels0 += (half4)bias;
1319 pixels1 += (half4)bias;
1320#endif /* defined(HAS_BIAS) */
1321
Giorgio Arenad056e572020-10-12 11:53:51 +01001322 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1323 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001324}
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001325#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Giorgio Arenad051e972018-06-20 11:46:42 +01001326
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001327#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP)
1328/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
1329 *
1330 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
1331 * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
1332 * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
1333 * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
1334 * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
1335 * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
1336 * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
1337 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1338 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1339 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1340 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
1341 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
1342 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1343 *
1344 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
1345 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1346 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1347 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1348 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1349 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1350 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1351 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1352 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1353 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1354 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1355 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1356 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1357 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1358 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1359 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1360 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1361 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1362 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1363 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1364 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
1365 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1366 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1367 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1368 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1369 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1370 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1371 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1372 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1373 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1374 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1375 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1376 */
1377__kernel void dwc_MxN_native_fp_nhwc(
1378 TENSOR4D_DECLARATION(src),
1379 TENSOR4D_DECLARATION(dst),
Michele Di Giorgio1dce3102019-10-22 10:29:03 +01001380 TENSOR3D_DECLARATION(weights)
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001381#if defined(HAS_BIAS)
Michele Di Giorgio1dce3102019-10-22 10:29:03 +01001382 ,
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001383 VECTOR_DECLARATION(biases)
1384#endif // defined(HAS_BIAS)
1385)
1386{
1387 int x = get_global_id(0); // channels
1388 int y = get_global_id(1); // spatial coordinate x
1389#if defined(DST_DEPTH)
1390 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1391 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1392#else // defined(DST_DEPTH)
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001393 int z = get_global_id(2); // spatial coordinate y
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001394#endif // defined(DST_DEPTH)
1395
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001396 __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001397
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001398 __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001399
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001400 __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001401
1402#if defined(HAS_BIAS)
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001403 __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001404#endif // defined(HAS_BIAS)
1405
1406#if defined(DST_DEPTH)
1407 s_addr += b * src_stride_w;
1408 d_addr += b * dst_stride_w;
1409#endif // defined(DST_DEPTH)
1410
1411 for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
1412 {
1413 // Each work-item computes N0x1x1 elements
1414 VEC_DATA_TYPE(DATA_TYPE, N0)
1415 res = 0;
1416
1417 int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
1418 int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1419
1420 for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
1421 {
1422 if(y_coord >= 0 && y_coord < SRC_DIM2)
1423 {
1424 int x_coord_tmp = x_coord;
1425
1426 for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
1427 {
1428 if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
1429 {
1430 int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
1431 int w_offset = xk * weights_stride_y + yk * weights_stride_z;
1432
1433 // Load input and weights values
1434 VEC_DATA_TYPE(DATA_TYPE, N0)
1435 i = VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset));
1436 VEC_DATA_TYPE(DATA_TYPE, N0)
1437 w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset));
1438
1439#if GPU_ARCH == GPU_ARCH_MIDGARD
1440 res += i * w;
1441#else // GPU_ARCH == GPU_ARCH_MIDGARD
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001442 res = fma(i, w, res);
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001443#endif // GPU_ARCH == GPU_ARCH_MIDGARD
1444 }
1445 x_coord_tmp += DILATION_X;
1446 }
1447 }
1448 y_coord += DILATION_Y;
1449 }
1450
1451#if defined(HAS_BIAS)
1452 res += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));
1453#endif // defined(HAS_BIAS)
1454
Giorgio Arenad056e572020-10-12 11:53:51 +01001455 res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001456
1457 VSTORE(N0)
1458 (res, 0, (__global DATA_TYPE *)(d_addr));
1459
1460 w_addr += sizeof(DATA_TYPE);
1461 d_addr += sizeof(DATA_TYPE);
1462#if defined(HAS_BIAS)
1463 b_addr += sizeof(DATA_TYPE);
1464#endif // defined(HAS_BIAS)
1465 }
1466}
1467#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP)
1468
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001469#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001470
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001471#if DATA_TYPE != float || DATA_TYPE != half
1472#error "Unsupported data type"
1473#endif // DATA_TYPE != float || DATA_TYPE != half
1474
1475#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001476
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001477#define FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond) \
1478 ({ \
1479 basename##0 = select(basename##0, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_DATA_TYPE(data_type, vec_size))((cond).s0)); \
1480 basename##1 = select(basename##1, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_DATA_TYPE(data_type, vec_size))((cond).s1)); \
1481 basename##2 = select(basename##2, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_DATA_TYPE(data_type, vec_size))((cond).s2)); \
1482 })
1483
1484#define FILL_ZERO_OUT_OF_BOUND_4(data_type, vec_size, basename, cond) \
1485 ({ \
1486 FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond); \
1487 basename##3 = select(basename##3, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_DATA_TYPE(data_type, vec_size))((cond).s3)); \
1488 })
1489
Giorgio Arenad051e972018-06-20 11:46:42 +01001490#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001491
Giorgio Arenad051e972018-06-20 11:46:42 +01001492/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1493 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001494 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001495 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1496 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1497 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1498 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1499 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1500 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001501 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
Usama Arif6a98a6e2019-05-10 17:07:27 +01001502 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001503 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1504 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001505 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
1506 * @note In case of biases, -DHAS_BIAS must to be passed at compile
1507 * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)
Giorgio Arenad051e972018-06-20 11:46:42 +01001508 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001509 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001510 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001511 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001512 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001513 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001514 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001515 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1516 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1517 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1518 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001519 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1520 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1521 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1522 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1523 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1524 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1525 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001526 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1527 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001528 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001529 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001530 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1531 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1532 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1533 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1534 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1535 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1536 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1537 * @param[in] max_offset Max offset for the input tensor
1538 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1539 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1540 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1541 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1542 */
1543__kernel void depthwise_convolution_3x3_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001544 TENSOR4D_DECLARATION(src),
1545 TENSOR4D_DECLARATION(dst),
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001546 TENSOR3D_DECLARATION(weights)
Giorgio Arenad051e972018-06-20 11:46:42 +01001547#if defined(HAS_BIAS)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001548 ,
1549 VECTOR_DECLARATION(biases)
Giorgio Arenad051e972018-06-20 11:46:42 +01001550#endif /* defined(HAS_BIAS) */
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001551)
Giorgio Arenad051e972018-06-20 11:46:42 +01001552{
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001553 int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
1554 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001555#if defined(DST_DEPTH)
1556 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1557 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001558#else // defined(DST_DEPTH)
1559 int z = get_global_id(2); // spatial coordinate y
1560#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001561
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001562 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
Giorgio Arenad051e972018-06-20 11:46:42 +01001563
Georgios Pinitas37044642018-10-30 14:53:25 +00001564#if defined(DST_DEPTH)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001565 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
Georgios Pinitas37044642018-10-30 14:53:25 +00001566#else /* defined(DST_DEPTH) */
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001567 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
Georgios Pinitas37044642018-10-30 14:53:25 +00001568#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001569
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001570 int3 src_coord_y = (int3)(y * CONV_STRIDE_X - CONV_PAD_LEFT) + (int3)(0, DILATION_X, 2 * DILATION_X);
1571 int3 src_coord_z = (int3)(z * CONV_STRIDE_Y - CONV_PAD_TOP) + (int3)(0, DILATION_Y, 2 * DILATION_Y);
Giorgio Arenad051e972018-06-20 11:46:42 +01001572
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001573 int3 src_offset_y = clamp(src_coord_y, (int3)0, (int3)(SRC_DIM_1 - 1));
1574 int3 src_offset_z = clamp(src_coord_z, (int3)0, (int3)(SRC_DIM_2 - 1));
1575
1576 // Use these vectors to check whether the unclamped load would have been out of bounds
1577 src_coord_y = (src_offset_y != src_coord_y);
1578 src_coord_z = (src_offset_z != src_coord_z);
1579
1580 src_offset_y *= (int3)src_stride_y;
1581 src_offset_z *= (int3)src_stride_z;
1582
1583 // We compute VEC_SIZEx1x1 [C,W,H] elements
1584 VEC_FLOAT acc0 = 0;
Giorgio Arenad051e972018-06-20 11:46:42 +01001585
1586 // Load weights
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001587 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));
1588 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));
1589 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));
1590 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));
1591 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));
1592 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));
1593 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));
1594 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));
1595 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001596
1597 // Load input values
1598 // z == 0
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001599 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));
1600 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));
1601 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001602
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001603 FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s0);
1604
1605 acc0 = fma(values0, w0, acc0);
1606 acc0 = fma(values1, w1, acc0);
1607 acc0 = fma(values2, w2, acc0);
Giorgio Arenad051e972018-06-20 11:46:42 +01001608
1609 // z == 1
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001610 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
1611 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
1612 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
1613
1614 FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s1);
1615
1616 acc0 = fma(values0, w3, acc0);
1617 acc0 = fma(values1, w4, acc0);
1618 acc0 = fma(values2, w5, acc0);
Giorgio Arenad051e972018-06-20 11:46:42 +01001619
1620 // z == 2
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001621 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
1622 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
1623 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001624
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001625 FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s2);
Giorgio Arenad051e972018-06-20 11:46:42 +01001626
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001627 acc0 = fma(values0, w6, acc0);
1628 acc0 = fma(values1, w7, acc0);
1629 acc0 = fma(values2, w8, acc0);
Giorgio Arenad051e972018-06-20 11:46:42 +01001630
1631#if defined(HAS_BIAS)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001632 __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
1633 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);
1634 acc0 += bias_values;
Giorgio Arenad051e972018-06-20 11:46:42 +01001635#endif // defined(HAS_BIAS)
1636
Georgios Pinitas37044642018-10-30 14:53:25 +00001637#if defined(DST_DEPTH)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001638 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
Georgios Pinitas37044642018-10-30 14:53:25 +00001639#else /* defined(DST_DEPTH) */
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001640 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001641#endif /* defined(DST_DEPTH) */
1642
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001643 acc0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);
1644 STORE_VECTOR_SELECT(acc, DATA_TYPE, dst_addr, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
Giorgio Arenad051e972018-06-20 11:46:42 +01001645}
1646#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1647
1648#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1649/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1650 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001651 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001652 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1653 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1654 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1655 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1656 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1657 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001658 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001659 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1660 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001661 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1662 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
1663 * @note The size of the output's second dimension must be passed at compile time using -DDST_DIM_1 (e.g. -DDST_DIM_1=64)
1664 * @note The size of the output's third dimension must be passed at compile time using -DDST_DIM_2 (e.g. -DDST_DIM_2=32)
1665 * @note In case of biases, -DHAS_BIAS must to be passed at compile
1666 * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)
Giorgio Arenad051e972018-06-20 11:46:42 +01001667 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001668 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001669 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001670 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001671 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001672 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001673 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001674 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1675 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1676 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1677 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001678 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1679 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1680 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1681 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1682 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1683 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1684 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001685 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1686 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001687 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001688 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001689 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1690 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1691 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1692 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1693 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1694 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1695 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1696 * @param[in] max_offset Max offset for the input tensor
1697 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1698 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1699 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1700 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1701 */
1702__kernel void depthwise_convolution_3x3_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +00001703 TENSOR4D_DECLARATION(src),
1704 TENSOR4D_DECLARATION(dst),
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001705 TENSOR3D_DECLARATION(weights)
Giorgio Arenad051e972018-06-20 11:46:42 +01001706#if defined(HAS_BIAS)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001707 ,
1708 VECTOR_DECLARATION(biases)
Giorgio Arenad051e972018-06-20 11:46:42 +01001709#endif /* defined(HAS_BIAS) */
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001710)
Giorgio Arenad051e972018-06-20 11:46:42 +01001711{
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001712 int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
1713 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001714#if defined(DST_DEPTH)
1715 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1716 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001717#else // defined(DST_DEPTH)
1718 int z = get_global_id(2); // spatial coordinate y
1719#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001720
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001721 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
Giorgio Arenad051e972018-06-20 11:46:42 +01001722
Georgios Pinitas37044642018-10-30 14:53:25 +00001723#if defined(DST_DEPTH)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001724 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
Georgios Pinitas37044642018-10-30 14:53:25 +00001725#else /* defined(DST_DEPTH) */
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001726 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
Georgios Pinitas37044642018-10-30 14:53:25 +00001727#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001728
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001729 int4 src_coord_y = (int4)(y * NUM_ROWS_PROCESSED - CONV_PAD_LEFT) + V_OFFS4(int4);
1730 int4 src_coord_z = (int4)(z * NUM_PLANES_PROCESSED - CONV_PAD_TOP) + V_OFFS4(int4);
Giorgio Arenad051e972018-06-20 11:46:42 +01001731
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001732 int4 src_offset_y = clamp(src_coord_y, (int4)0, (int4)(SRC_DIM_1 - 1));
1733 int4 src_offset_z = clamp(src_coord_z, (int4)0, (int4)(SRC_DIM_2 - 1));
1734
1735 // Use these vectors to check whether the unclamped load would have been out of bounds
1736 src_coord_y = (src_offset_y != src_coord_y);
1737 src_coord_z = (src_offset_z != src_coord_z);
1738
1739 src_offset_y *= (int4)src_stride_y;
1740 src_offset_z *= (int4)src_stride_z;
1741
1742 // We compute VEC_SIZEx2x2 [C,W,H] elements
Giorgio Arenad051e972018-06-20 11:46:42 +01001743 VEC_FLOAT acc0 = 0;
1744 VEC_FLOAT acc1 = 0;
1745 VEC_FLOAT acc2 = 0;
1746 VEC_FLOAT acc3 = 0;
1747
1748 // Load weights
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001749 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));
1750 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));
1751 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));
1752 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));
1753 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));
1754 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));
1755 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));
1756 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));
1757 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001758
1759 // Load input values
1760 // z == 0
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001761 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));
1762 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));
1763 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));
1764 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001765
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001766 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s0);
Giorgio Arenad051e972018-06-20 11:46:42 +01001767
1768 acc0 = fma(values0, w0, acc0);
1769 acc0 = fma(values1, w1, acc0);
1770 acc0 = fma(values2, w2, acc0);
1771 acc1 = fma(values1, w0, acc1);
1772 acc1 = fma(values2, w1, acc1);
1773 acc1 = fma(values3, w2, acc1);
1774
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001775 // z == 1
1776 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
1777 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
1778 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
1779 values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001780
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001781 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s1);
Giorgio Arenad051e972018-06-20 11:46:42 +01001782
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001783 acc0 = fma(values0, w3, acc0);
1784 acc0 = fma(values1, w4, acc0);
1785 acc0 = fma(values2, w5, acc0);
1786 acc1 = fma(values1, w3, acc1);
1787 acc1 = fma(values2, w4, acc1);
1788 acc1 = fma(values3, w5, acc1);
Giorgio Arenad051e972018-06-20 11:46:42 +01001789
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001790 acc2 = fma(values0, w0, acc2);
1791 acc2 = fma(values1, w1, acc2);
1792 acc2 = fma(values2, w2, acc2);
1793 acc3 = fma(values1, w0, acc3);
1794 acc3 = fma(values2, w1, acc3);
1795 acc3 = fma(values3, w2, acc3);
Giorgio Arenad051e972018-06-20 11:46:42 +01001796
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001797 // z == 2
1798 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
1799 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
1800 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
1801 values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s3));
1802
1803 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s2);
1804
1805 acc0 = fma(values0, w6, acc0);
1806 acc0 = fma(values1, w7, acc0);
1807 acc0 = fma(values2, w8, acc0);
1808 acc1 = fma(values1, w6, acc1);
1809 acc1 = fma(values2, w7, acc1);
1810 acc1 = fma(values3, w8, acc1);
1811
1812 acc2 = fma(values0, w3, acc2);
1813 acc2 = fma(values1, w4, acc2);
1814 acc2 = fma(values2, w5, acc2);
1815 acc3 = fma(values1, w3, acc3);
1816 acc3 = fma(values2, w4, acc3);
1817 acc3 = fma(values3, w5, acc3);
1818
1819 // z == 3
1820 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s0));
1821 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s1));
1822 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s2));
1823 values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s3));
1824
1825 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s3);
1826
1827 acc2 = fma(values0, w6, acc2);
1828 acc2 = fma(values1, w7, acc2);
1829 acc2 = fma(values2, w8, acc2);
1830 acc3 = fma(values1, w6, acc3);
1831 acc3 = fma(values2, w7, acc3);
1832 acc3 = fma(values3, w8, acc3);
Giorgio Arenad051e972018-06-20 11:46:42 +01001833
1834#if defined(HAS_BIAS)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001835 __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
Giorgio Arenad051e972018-06-20 11:46:42 +01001836
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001837 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001838
1839 acc0 += bias_values;
1840 acc1 += bias_values;
1841 acc2 += bias_values;
1842 acc3 += bias_values;
1843#endif // defined(HAS_BIAS)
1844
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001845 int2 dst_offset_y = min((int2)(y * NUM_ROWS_PROCESSED) + V_OFFS2(int2), (int2)(DST_DIM_1 - 1)) * (int2)dst_stride_y;
1846 int dst_coord_z = z * NUM_PLANES_PROCESSED;
Giorgio Arenad051e972018-06-20 11:46:42 +01001847
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001848#if defined(DST_DEPTH)
1849 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z + b * dst_stride_w;
1850#else // defined(DST_DEPTH)
1851 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z;
1852#endif // defined(DST_DEPTH)
1853
1854 /* Store vectors in reverse order along the Y. The Y offsets are calculated so that they are forced to be in bound.
1855 * If only the first address is in bound, the Y offset of the second address will be brought back and there will be 2 writes in the same location for the same thread.
1856 * Since the last vector to be written is always the valid one for that location, it overwrites the wrong values.
1857 */
1858 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc1, A_VAL, B_VAL);
1859 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
1860
1861 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);
1862 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
Giorgio Arenad051e972018-06-20 11:46:42 +01001863
1864#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001865 if((dst_coord_z + 1) < DST_DIM_2)
Giorgio Arenad051e972018-06-20 11:46:42 +01001866#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1867 {
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001868 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc3, A_VAL, B_VAL);
1869 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
1870
1871 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc2, A_VAL, B_VAL);
1872 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
Giorgio Arenad051e972018-06-20 11:46:42 +01001873 }
1874}
1875
1876#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +01001877#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)