blob: 22a38e70941b8204425066a9a6390b42a95b3170 [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
Giorgio Arenadcf4c872021-04-16 12:41:45 +01002 * Copyright (c) 2017-2021 Arm Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Giorgio Arena93a690e2017-08-01 16:09:33 +010024#include "helpers.h"
25
Usama Arif6a98a6e2019-05-10 17:07:27 +010026#include "activation_float_helpers.h"
Manuel Bottinia788c2f2019-04-08 13:18:00 +010027
28/** Get the pointer position at a certain offset in x and y direction.
29 *
30 * @param[in] ptr Pointer to the starting position of the buffer
31 * @param[in] x Relative X position
32 * @param[in] y Relative Y position
33 * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
34 * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
35 *
36 * @return a uchar
37 */
38inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
39{
40 return ptr + x * stride_x + y * stride_y;
41}
42
43#if(DILATION_X == 1 && DILATION_Y == 1)
44
Giorgio Arenadcf4c872021-04-16 12:41:45 +010045#define CONVOLUTION1x3_2X1_STRIDE1(acc, src0, weights_row0) \
46 ({ \
47 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
48 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
49 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
50 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
51 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
52 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
Manuel Bottinia788c2f2019-04-08 13:18:00 +010053 })
54
Giorgio Arenadcf4c872021-04-16 12:41:45 +010055#define CONVOLUTION1x3_4X1_STRIDE1(acc, src0, weights_row0) \
56 ({ \
57 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
58 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
59 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
60 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
61 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
62 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
63 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
64 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
65 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
66 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
67 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
68 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
Manuel Bottinia788c2f2019-04-08 13:18:00 +010069 })
70
Giorgio Arenadcf4c872021-04-16 12:41:45 +010071#define CONVOLUTION1x3_2X1_STRIDE2(acc, src0, src1, weights_row0) \
72 ({ \
73 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
74 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
75 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
76 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
77 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
78 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
Manuel Bottinia788c2f2019-04-08 13:18:00 +010079 })
80
Giorgio Arenadcf4c872021-04-16 12:41:45 +010081#define CONVOLUTION1x3_4X1_STRIDE2(acc, src0, src1, weights_row0) \
82 ({ \
83 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
84 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
85 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
86 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
87 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
88 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
89 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
90 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
91 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
92 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
93 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
94 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
Manuel Bottinia788c2f2019-04-08 13:18:00 +010095 })
96
97#else /* DILATION_X==1 && DILATION_Y==1 */
98
Giorgio Arenadcf4c872021-04-16 12:41:45 +010099#define CONVOLUTION1x3_2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
100 ({ \
101 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
102 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
103 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
104 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
105 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
106 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100107 })
108
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100109#define CONVOLUTION1x3_2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
110 ({ \
111 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
112 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
113 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
114 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
115 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
116 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100117 })
118
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100119#define CONVOLUTION1x3_4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
120 ({ \
121 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
122 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
123 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
124 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
125 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
126 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
127 acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
128 acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
129 acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
130 acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
131 acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
132 acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100133 })
134
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100135#define CONVOLUTION1x3_4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
136 ({ \
137 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
138 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
139 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
140 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
141 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
142 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
143 acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
144 acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
145 acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
146 acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
147 acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
148 acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100149 })
150
151#endif /* DILATION_X==1 && DILATION_Y==1 */
152
153#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100154#if defined(CONV_STRIDE_X)
155
Giorgio Arena93a690e2017-08-01 16:09:33 +0100156#if CONV_STRIDE_X == 1
157#define convolution1x3 convolution1x3_stride_1
158#elif CONV_STRIDE_X == 2
159#define convolution1x3 convolution1x3_stride_2
160#elif CONV_STRIDE_X == 3
161#define convolution1x3 convolution1x3_stride_3
162#else /* CONV_STRIDE_X */
163#error "Stride not supported"
164#endif /* CONV_STRIDE_X */
165
166/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
167 *
168 * @param[in] left_pixel Pointer to the left pixel.
169 * @param[in] left_coeff Weight of the left pixel
170 * @param[in] middle_coeff Weight of the middle pixel
171 * @param[in] right_coeff Weight of the right pixel
172 *
173 * @return a float2 containing 2 convoluted values.
174 */
175inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
176 const float left_coeff,
177 const float middle_coeff,
178 const float right_coeff)
179{
Usama Arife73686a2019-04-08 17:30:48 +0100180#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100181 float4 temp = vload4(0, (__global float *)left_pixel);
182
183 float2 left = CONVERT(temp.s01, float2);
184 float2 middle = CONVERT(temp.s12, float2);
185 float2 right = CONVERT(temp.s23, float2);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100186 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100187#else /* DILATION_X==1 && DILATION_Y==1 */
188 return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
189 + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
190 + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
191#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100192}
193
194/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
195 *
196 * @param[in] left_pixel Pointer to the left pixel.
197 * @param[in] left_coeff Weight of the left pixel
198 * @param[in] middle_coeff Weight of the middle pixel
199 * @param[in] right_coeff Weight of the right pixel
200 *
201 * @return a float2 containing 2 convoluted values.
202 */
203inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
204 const float left_coeff,
205 const float middle_coeff,
206 const float right_coeff)
207{
Usama Arife73686a2019-04-08 17:30:48 +0100208#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100209 float4 temp0 = vload4(0, (__global float *)left_pixel);
210 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
211
212 float2 left = CONVERT(temp0.s02, float2);
213 float2 middle = CONVERT(temp0.s13, float2);
214 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
215
216 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100217#else /* DILATION_X==1 && DILATION_Y==1 */
218 __global float *left_pixel_float = (__global float *)left_pixel;
219
220 return vload4(0, left_pixel_float).s02 * (float2)left_coeff
221 + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
222 + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
223
224#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100225}
226
227/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
228 *
229 * @param[in] left_pixel Pointer to the left pixel.
230 * @param[in] left_coeff Weight of the left pixel
231 * @param[in] middle_coeff Weight of the middle pixel
232 * @param[in] right_coeff Weight of the right pixel
233 *
234 * @return a float2 containing 2 convoluted values.
235 */
236inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
237 const float left_coeff,
238 const float middle_coeff,
239 const float right_coeff)
240{
Usama Arife73686a2019-04-08 17:30:48 +0100241#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100242 float4 temp0 = vload4(0, (__global float *)left_pixel);
243 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
244
245 float2 left = CONVERT(temp0.s03, float2);
246 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
247 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
248
249 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100250#else /* DILATION_X==1 && DILATION_Y==1 */
251 __global float *left_pixel_float = (__global float *)left_pixel;
252
253 return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
254 + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
255 + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
256#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100257}
258
259/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
260 *
261 * Convolution matrix layout:
262 *
263 * [ mat0, mat1, mat2 ]\n
264 * [ mat3, mat4, mat5 ]\n
265 * [ mat6, mat7, mat8 ]\n
266 *
267 * @param[in] src A pointer to source Image structure
268 * @param[in] mat0 Coefficient from the convolution matrix
269 * @param[in] mat1 Coefficient from the convolution matrix
270 * @param[in] mat2 Coefficient from the convolution matrix
271 * @param[in] mat3 Coefficient from the convolution matrix
272 * @param[in] mat4 Coefficient from the convolution matrix
273 * @param[in] mat5 Coefficient from the convolution matrix
274 * @param[in] mat6 Coefficient from the convolution matrix
275 * @param[in] mat0 Coefficient from the convolution matrix
276 * @param[in] mat7 Coefficient from the convolution matrix
277 * @param[in] mat8 Coefficient from the convolution matrix
278 *
279 * @return a float2 containing 2 convoluted values.
280 */
281inline float2 convolution3x3(
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100282 __global const uchar *src,
283 unsigned int src_stride_y,
Giorgio Arena93a690e2017-08-01 16:09:33 +0100284 const float mat0, const float mat1, const float mat2,
285 const float mat3, const float mat4, const float mat5,
286 const float mat6, const float mat7, const float mat8)
287{
288 float2 pixels;
289
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100290 pixels = convolution1x3((src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
291 pixels += convolution1x3((src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
292 pixels += convolution1x3((src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100293
294 return pixels;
295}
296
Gian Marcoc799ed82018-02-01 16:57:48 +0000297/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000298 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100299 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
300 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
301 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000302 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
303 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000304 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000305 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000306 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000307 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000308 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
309 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000310 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000311 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
312 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
313 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
314 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
315 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
316 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
317 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000318 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000319 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
320 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
321 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
322 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
323 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
324 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
325 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
326 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
327 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
328 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
329 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
330 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100331__kernel void depthwise_convolution_3x3(
332 TENSOR3D_DECLARATION(src),
333 TENSOR3D_DECLARATION(dst),
334 TENSOR3D_DECLARATION(weights)
335#if defined(HAS_BIAS)
336 ,
337 VECTOR_DECLARATION(biases)
338#endif //defined(HAS_BIAS)
339)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100340{
Giorgio Arena93a690e2017-08-01 16:09:33 +0100341 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100342 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100343
344 float2 pixels = 0.0f;
Giorgio Arena93a690e2017-08-01 16:09:33 +0100345
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100346 // Extract channel and linearized batch indices
347 const int channel = get_global_id(2) % DST_CHANNELS;
348 const int batch = get_global_id(2) / DST_CHANNELS;
349 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100350
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100351 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100352
Giorgio Arena15bc8482020-12-08 14:34:00 +0000353 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
354 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Giorgio Arena93a690e2017-08-01 16:09:33 +0100355
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100356 // Load the weights
357 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
358 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
359 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
360
361 pixels = convolution3x3(src_addr, src_stride_y,
362 weights_values0.s0, weights_values0.s1, weights_values0.s2,
363 weights_values1.s0, weights_values1.s1, weights_values1.s2,
364 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100365#if defined(HAS_BIAS)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100366 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
367
368 float bias = *((__global float *)(vector_offset(&biases, channel)));
369
370 pixels += (float2)bias;
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100371#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100372
Giorgio Arenad056e572020-10-12 11:53:51 +0100373 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100374}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100375#endif //defined(CONV_STRIDE_X)
376
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100377#if(DILATION_X > 1 || DILATION_Y > 1)
Usama Arife73686a2019-04-08 17:30:48 +0100378
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100379/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100380 *
381 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
382 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
383 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
384 * @param[in] y_offset Offset from the source tensor from which to start convolution
385 * @param[in] weights_addr Pointer from where to get weights
386 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
387 */
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100388inline float2 convolution_3x3_dilation_stridex1_stridey1_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
389 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
Usama Arife73686a2019-04-08 17:30:48 +0100390{
391 // Load the weights
392 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
393 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
394 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
395
396 float2 pixels0 = 0.0f;
397
398 float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
399 float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
400 float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
401
402 float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
403 float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
404 float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
405
406 float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
407 float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
408 float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
409
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100410 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
411 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
412 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
Usama Arife73686a2019-04-08 17:30:48 +0100413
414 return pixels0;
415}
416
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100417/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100418 *
419 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
420 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
421 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
422 * @param[in] y_offset Offset from the source tensor from which to start convolution
423 * @param[in] weights_addr Pointer from where to get weights
424 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
425 */
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100426inline float2 convolution_3x3_dilation_stridex2_stridey2_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
427 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
Usama Arife73686a2019-04-08 17:30:48 +0100428{
429 // Load the weights
430 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
431 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
432 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
433
434 float2 pixels0 = 0.0f;
435
436 float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
437 float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
438 float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
439
440 float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
441 float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
442 float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
443
444 float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
445 float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
446 float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
447
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100448 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
449 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
450 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
Usama Arife73686a2019-04-08 17:30:48 +0100451
452 return pixels0;
453}
454
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100455#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
Usama Arife73686a2019-04-08 17:30:48 +0100456
Gian Marcoc799ed82018-02-01 16:57:48 +0000457/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
458 * stride_x and stride_y are equal to 1
459 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100460 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100461 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
462 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
463 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100464 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000465 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
466 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000467 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000468 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000469 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000470 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000471 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
472 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
473 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
474 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
475 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
476 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
477 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
478 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
479 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
480 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
481 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
482 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
483 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
484 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
485 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
486 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
487 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
488 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
489 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
490 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
491 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
492 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
493 */
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100494__kernel void depthwise_convolution_3x3_stridex1_stridey1_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000495 TENSOR3D_DECLARATION(src),
496 TENSOR3D_DECLARATION(dst),
497 TENSOR3D_DECLARATION(weights)
498#if defined(HAS_BIAS)
499 ,
500 VECTOR_DECLARATION(biases)
501#endif //defined(HAS_BIAS)
502)
503{
Gian Marcoc799ed82018-02-01 16:57:48 +0000504 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100505 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000506
507 float2 pixels0 = 0.0f;
508 float2 pixels1 = 0.0f;
509 float2 pixels2 = 0.0f;
510 float2 pixels3 = 0.0f;
511
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100512 // Extract channel and linearized batch indices
513 const int channel = get_global_id(2) % DST_CHANNELS;
514 const int batch = get_global_id(2) / DST_CHANNELS;
515 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
516 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena15bc8482020-12-08 14:34:00 +0000517 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
518 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000519
Usama Arife73686a2019-04-08 17:30:48 +0100520#if(DILATION_X == 1 && DILATION_Y == 1)
Gian Marcoc799ed82018-02-01 16:57:48 +0000521 // Load the weights
522 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
523 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
524 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
525
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000526 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000527 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
528 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
529 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
530 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000531 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
532 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000533
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100534 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src00, weights_row0);
535 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src10, weights_row1);
536 CONVOLUTION1x3_2X1_STRIDE1(pixels0, src20, weights_row2);
537 CONVOLUTION1x3_2X1_STRIDE1(pixels1, src10, weights_row0);
538 CONVOLUTION1x3_2X1_STRIDE1(pixels1, src20, weights_row1);
539 CONVOLUTION1x3_2X1_STRIDE1(pixels1, src30, weights_row2);
540 CONVOLUTION1x3_2X1_STRIDE1(pixels2, src20, weights_row0);
541 CONVOLUTION1x3_2X1_STRIDE1(pixels2, src30, weights_row1);
542 CONVOLUTION1x3_2X1_STRIDE1(pixels2, src40, weights_row2);
543 CONVOLUTION1x3_2X1_STRIDE1(pixels3, src30, weights_row0);
544 CONVOLUTION1x3_2X1_STRIDE1(pixels3, src40, weights_row1);
545 CONVOLUTION1x3_2X1_STRIDE1(pixels3, src50, weights_row2);
Gian Marcoc799ed82018-02-01 16:57:48 +0000546
Usama Arife73686a2019-04-08 17:30:48 +0100547#else /* DILATION_X==1 && DILATION_Y==1 */
548
549 //3x3 Convolution of elements starting in 0th row
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100550 pixels0 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +0100551 //3x3 Convolution of elements starting in 1st row
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100552 pixels1 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 1, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +0100553 //3x3 Convolution of elements starting in 2nd row
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100554 pixels2 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +0100555 //3x3 Convolution of elements starting in 3rd row
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100556 pixels3 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 3, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +0100557
558#endif /* DILATION_X==1 && DILATION_Y==1 */
559
Gian Marcoc799ed82018-02-01 16:57:48 +0000560#ifdef HAS_BIAS
561 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
562
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100563 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000564
565 pixels0 += (float2)bias;
566 pixels1 += (float2)bias;
567 pixels2 += (float2)bias;
568 pixels3 += (float2)bias;
569#endif /* defined(HAS_BIAS) */
570
Giorgio Arenad056e572020-10-12 11:53:51 +0100571 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
572 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
573 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
574 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000575}
576
577/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
578 * stride_x and stride_y are equal to 2
579 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100580 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100581 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
582 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
583 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100584 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000585 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
586 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000587 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000588 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000589 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000590 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000591 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
592 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
593 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
594 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
595 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
596 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
597 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
598 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
599 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
600 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
601 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
602 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
603 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
604 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
605 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
606 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
607 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
608 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
609 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
610 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
611 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
612 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
613 */
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100614__kernel void depthwise_convolution_3x3_stridex2_stridey2_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000615 TENSOR3D_DECLARATION(src),
616 TENSOR3D_DECLARATION(dst),
617 TENSOR3D_DECLARATION(weights)
618#if defined(HAS_BIAS)
619 ,
620 VECTOR_DECLARATION(biases)
621#endif //defined(HAS_BIAS)
622)
623{
Gian Marcoc799ed82018-02-01 16:57:48 +0000624 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100625 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000626
627 float2 pixels0 = 0.0f;
628 float2 pixels1 = 0.0f;
629
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100630 // Extract channel and linearized batch indices
Georgios Pinitas37044642018-10-30 14:53:25 +0000631 const int channel = get_global_id(2) % DST_CHANNELS;
632 const int batch = get_global_id(2) / DST_CHANNELS;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100633 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
634 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena15bc8482020-12-08 14:34:00 +0000635 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
636 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000637
Usama Arife73686a2019-04-08 17:30:48 +0100638#if(DILATION_X == 1 && DILATION_Y == 1)
639
Gian Marcoc799ed82018-02-01 16:57:48 +0000640 // Load the weights
641 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
642 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
643 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
644
645 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
646 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
647 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
648 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
649 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
650 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
651 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
652 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
653 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
654 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
655 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
656
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100657 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src00, src01, weights_row0);
658 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src10, src11, weights_row1);
659 CONVOLUTION1x3_2X1_STRIDE2(pixels0, src20, src21, weights_row2);
660 CONVOLUTION1x3_2X1_STRIDE2(pixels1, src20, src21, weights_row0);
661 CONVOLUTION1x3_2X1_STRIDE2(pixels1, src30, src31, weights_row1);
662 CONVOLUTION1x3_2X1_STRIDE2(pixels1, src40, src41, weights_row2);
Gian Marcoc799ed82018-02-01 16:57:48 +0000663
Usama Arife73686a2019-04-08 17:30:48 +0100664#else /* DILATION_X==1 && DILATION_Y==1 */
665
666 //3x3 Convolution of elements starting in 0th row
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100667 pixels0 = convolution_3x3_dilation_stridex2_stridey2_f32(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +0100668 //3x3 Convolution of elements starting in 2nd row
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100669 pixels1 = convolution_3x3_dilation_stridex2_stridey2_f32(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +0100670#endif /* DILATION_X==1 && DILATION_Y==1 */
671
Gian Marcoc799ed82018-02-01 16:57:48 +0000672#ifdef HAS_BIAS
673 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
674
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100675 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000676
677 pixels0 += (float2)bias;
678 pixels1 += (float2)bias;
679#endif /* defined(HAS_BIAS) */
680
Giorgio Arenad056e572020-10-12 11:53:51 +0100681 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
682 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000683}
684
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100685#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena76572242018-04-04 17:44:26 +0100686
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100687#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000688#if defined(CONV_STRIDE_X)
689#if CONV_STRIDE_X == 1
690#define convolution1x3_f16 convolution1x3_stride_1_f16
691#elif CONV_STRIDE_X == 2
692#define convolution1x3_f16 convolution1x3_stride_2_f16
693#elif CONV_STRIDE_X == 3
694#define convolution1x3_f16 convolution1x3_stride_3_f16
695#else /* CONV_STRIDE_X */
696#error "Stride not supported"
697#endif /* CONV_STRIDE_X */
698
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100699#if(DILATION_X > 1 || DILATION_Y > 1)
700
701/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16
702 *
703 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
704 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
705 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
706 * @param[in] y_offset Offset from the source tensor from which to start convolution
707 * @param[in] weights_addr Pointer from where to get weights
708 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
709 */
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100710inline half4 convolution_3x3_dilation_stridex1_stridey1_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
711 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100712{
713 // Load the weights
714 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
715 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
716 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
717
718 half4 pixels0 = 0.0f;
719
720 half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
721 half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
722 half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
723
724 half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
725 half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
726 half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
727
728 half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
729 half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
730 half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
731
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100732 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
733 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
734 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100735
736 return pixels0;
737}
738
739/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
740 *
741 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
742 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
743 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
744 * @param[in] y_offset Offset from the source tensor from which to start convolution
745 * @param[in] weights_addr Pointer from where to get weights
746 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
747 */
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100748inline half4 convolution_3x3_dilation_stridex2_stridey2_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
749 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100750{
751 // Load the weights
752 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
753 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
754 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
755
756 half4 pixels0 = 0.0f;
757
758 half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
759 half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
760 half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
761
762 half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
763 half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
764 half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
765
766 half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
767 half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
768 half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
769
Giorgio Arenadcf4c872021-04-16 12:41:45 +0100770 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
771 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
772 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100773
774 return pixels0;
775}
776
777#endif // (DILATION_X > 1 && DILATION_Y > 1)
778
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000779/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
780 *
781 * @param[in] left_pixel Pointer to the left pixel.
782 * @param[in] left_coeff Weight of the left pixel
783 * @param[in] middle_coeff Weight of the middle pixel
784 * @param[in] right_coeff Weight of the right pixel
785 *
786 * @return a half4 containing 4 convoluted values.
787 */
788inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
789 const half left_coeff,
790 const half middle_coeff,
791 const half right_coeff)
792{
Usama Arife73686a2019-04-08 17:30:48 +0100793#if(DILATION_X == 1 && DILATION_Y == 1)
794
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000795 half8 temp = vload8(0, (__global half *)left_pixel);
796
797 half4 left = CONVERT(temp.s0123, half4);
798 half4 middle = CONVERT(temp.s1234, half4);
799 half4 right = CONVERT(temp.s2345, half4);
800
801 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100802#else /* DILATION_X==1 && DILATION_Y==1 */
803 return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
804 + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
805 + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
806
807#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000808}
809
810/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
811 *
812 * @param[in] left_pixel Pointer to the left pixel.
813 * @param[in] left_coeff Weight of the left pixel
814 * @param[in] middle_coeff Weight of the middle pixel
815 * @param[in] right_coeff Weight of the right pixel
816 *
817 * @return a half4 containing 4 convoluted values.
818 */
819inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
820 const half left_coeff,
821 const half middle_coeff,
822 const half right_coeff)
823{
Usama Arife73686a2019-04-08 17:30:48 +0100824#if(DILATION_X == 1 && DILATION_Y == 1)
825
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000826 half8 temp0 = vload8(0, (__global half *)left_pixel);
827 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
828
829 half4 left = CONVERT(temp0.s0246, half4);
830 half4 middle = CONVERT(temp0.s1357, half4);
831 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
832
833 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100834#else /* DILATION_X==1 && DILATION_Y==1 */
835
836 __global half *left_pixel_float = (__global half *)left_pixel;
837
838 return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
839 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
840 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
841
842#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000843}
844
845/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
846 *
847 * @param[in] left_pixel Pointer to the left pixel.
848 * @param[in] left_coeff Weight of the left pixel
849 * @param[in] middle_coeff Weight of the middle pixel
850 * @param[in] right_coeff Weight of the right pixel
851 *
852 * @return a half4 containing 4 convoluted values.
853 */
854inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
855 const half left_coeff,
856 const half middle_coeff,
857 const half right_coeff)
858{
Usama Arife73686a2019-04-08 17:30:48 +0100859#if(DILATION_X == 1 && DILATION_Y == 1)
860
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000861 half16 temp0 = vload16(0, (__global half *)left_pixel);
862
863 half4 left = CONVERT(temp0.s0369, half4);
864 half4 middle = CONVERT(temp0.s147A, half4);
865 half4 right = CONVERT(temp0.s258B, half4);
866
867 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100868#else /* DILATION_X==1 && DILATION_Y==1 */
869
870 __global half *left_pixel_float = (__global half *)left_pixel;
871
872 return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
873 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
874 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
875
876#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000877}
878
879/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
880 *
881 * Convolution matrix layout:
882 *
883 * [ mat0, mat1, mat2 ]\n
884 * [ mat3, mat4, mat5 ]\n
885 * [ mat6, mat7, mat8 ]\n
886 *
887 * @param[in] src A pointer to source Image structure
888 * @param[in] mat0 Coefficient from the convolution matrix
889 * @param[in] mat1 Coefficient from the convolution matrix
890 * @param[in] mat2 Coefficient from the convolution matrix
891 * @param[in] mat3 Coefficient from the convolution matrix
892 * @param[in] mat4 Coefficient from the convolution matrix
893 * @param[in] mat5 Coefficient from the convolution matrix
894 * @param[in] mat6 Coefficient from the convolution matrix
895 * @param[in] mat0 Coefficient from the convolution matrix
896 * @param[in] mat7 Coefficient from the convolution matrix
897 * @param[in] mat8 Coefficient from the convolution matrix
898 *
899 * @return a half4 containing 4 convoluted values.
900 */
901inline half4 convolution3x3_f16(
Giorgio Arena15bc8482020-12-08 14:34:00 +0000902 __global uchar *src, uint src_stride_y,
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000903 const half mat0, const half mat1, const half mat2,
904 const half mat3, const half mat4, const half mat5,
905 const half mat6, const half mat7, const half mat8)
906{
907 half4 pixels;
908
Giorgio Arena15bc8482020-12-08 14:34:00 +0000909 pixels = convolution1x3_f16(src, mat0, mat1, mat2);
910 pixels += convolution1x3_f16(src + DILATION_Y * src_stride_y, mat3, mat4, mat5);
911 pixels += convolution1x3_f16(src + DILATION_Y * 2 * src_stride_y, mat6, mat7, mat8);
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000912
913 return pixels;
914}
915
Giorgio Arena76572242018-04-04 17:44:26 +0100916#if defined(DEPTH_MULTIPLIER)
917
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000918/** This OpenCL kernel computes the depthwise convolution 3x3
919 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100920 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100921 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
922 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
923 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100924 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000925 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
926 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000927 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000928 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000929 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000930 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
931 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000932 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000933 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000934 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
935 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
936 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
937 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
938 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
939 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
940 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000941 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000942 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
943 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
944 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
945 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
946 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
947 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
948 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100949 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000950 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
951 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
952 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
953 */
954__kernel void depthwise_convolution_3x3_f16(
955 TENSOR3D_DECLARATION(src),
956 TENSOR3D_DECLARATION(dst),
957 TENSOR3D_DECLARATION(weights)
958#if defined(HAS_BIAS)
959 ,
960 VECTOR_DECLARATION(biases)
961#endif //defined(HAS_BIAS)
962)
963{
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000964 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100965 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000966#if defined(HAS_BIAS)
967 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
968#endif //defined(HAS_BIAS)
969
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100970 // Extract channel and linearized batch indices
971 const int channel = get_global_id(2) % DST_CHANNELS;
972 const int batch = get_global_id(2) / DST_CHANNELS;
973 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
Giorgio Arena15bc8482020-12-08 14:34:00 +0000974 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
975 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100976 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100977
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000978 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100979 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
980 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
981 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000982
Giorgio Arena15bc8482020-12-08 14:34:00 +0000983 half4 pixels = convolution3x3_f16(src_addr, src_stride_y, weights_values0.s0, weights_values0.s1, weights_values0.s2,
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000984 weights_values1.s0, weights_values1.s1, weights_values1.s2,
985 weights_values2.s0, weights_values2.s1, weights_values2.s2);
986#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100987 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000988#endif //defined(HAS_BIAS)
989
Giorgio Arenad056e572020-10-12 11:53:51 +0100990 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000991}
Giorgio Arena76572242018-04-04 17:44:26 +0100992#endif // defined(DEPTH_MULTIPLIER)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000993#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000994
995/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
996 * when both stride_x and stride_y are equal to 1
997 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100998 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100999 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1000 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1001 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001002 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001003 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1004 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001005 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001006 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001007 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001008 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1009 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001010 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001011 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1012 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1013 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1014 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1015 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1016 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1017 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1018 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1019 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1020 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1021 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1022 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1023 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1024 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1025 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1026 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1027 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1028 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1029 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1030 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1031 */
Giorgio Arenadcf4c872021-04-16 12:41:45 +01001032__kernel void depthwise_convolution_3x3_stridex1_stridey1_f16(
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001033 TENSOR3D_DECLARATION(src),
1034 TENSOR3D_DECLARATION(dst),
1035 TENSOR3D_DECLARATION(weights)
1036#if defined(HAS_BIAS)
1037 ,
1038 VECTOR_DECLARATION(biases)
1039#endif //defined(HAS_BIAS)
1040)
1041{
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001042 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001043 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1044
1045 // Extract channel and linearized batch indices
1046 const int channel = get_global_id(2) % DST_CHANNELS;
1047 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001048
1049#ifdef HAS_BIAS
1050 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1051
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001052 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001053#endif /* defined(HAS_BIAS) */
1054
1055 half4 pixels0 = 0.0f;
1056 half4 pixels1 = 0.0f;
1057 half4 pixels2 = 0.0f;
1058 half4 pixels3 = 0.0f;
1059
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001060 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1061 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena15bc8482020-12-08 14:34:00 +00001062 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
1063 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001064
Usama Arife73686a2019-04-08 17:30:48 +01001065#if(DILATION_X == 1 && DILATION_Y == 1)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001066 // Load the weights
1067 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1068 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1069 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1070
1071 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
1072 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1073 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1074 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1075 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1076 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1077 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
1078
Giorgio Arenadcf4c872021-04-16 12:41:45 +01001079 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src00, weights_row0);
1080 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src10, weights_row1);
1081 CONVOLUTION1x3_4X1_STRIDE1(pixels0, src20, weights_row2);
1082 CONVOLUTION1x3_4X1_STRIDE1(pixels1, src10, weights_row0);
1083 CONVOLUTION1x3_4X1_STRIDE1(pixels1, src20, weights_row1);
1084 CONVOLUTION1x3_4X1_STRIDE1(pixels1, src30, weights_row2);
1085 CONVOLUTION1x3_4X1_STRIDE1(pixels2, src20, weights_row0);
1086 CONVOLUTION1x3_4X1_STRIDE1(pixels2, src30, weights_row1);
1087 CONVOLUTION1x3_4X1_STRIDE1(pixels2, src40, weights_row2);
1088 CONVOLUTION1x3_4X1_STRIDE1(pixels3, src30, weights_row0);
1089 CONVOLUTION1x3_4X1_STRIDE1(pixels3, src40, weights_row1);
1090 CONVOLUTION1x3_4X1_STRIDE1(pixels3, src50, weights_row2);
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001091
Usama Arife73686a2019-04-08 17:30:48 +01001092#else /* DILATION_X==1 && DILATION_Y==1 */
1093
1094 //3x3 Convolution of elements starting in 0th row
Giorgio Arenadcf4c872021-04-16 12:41:45 +01001095 pixels0 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +01001096 //3x3 Convolution of elements starting in 1st row
Giorgio Arenadcf4c872021-04-16 12:41:45 +01001097 pixels1 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 1, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +01001098 //3x3 Convolution of elements starting in 2nd row
Giorgio Arenadcf4c872021-04-16 12:41:45 +01001099 pixels2 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +01001100 //3x3 Convolution of elements starting in 3rd row
Giorgio Arenadcf4c872021-04-16 12:41:45 +01001101 pixels3 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 3, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +01001102
1103#endif /* DILATION_X==1 && DILATION_Y==1 */
1104
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001105#ifdef HAS_BIAS
1106 pixels0 += (half4)bias;
1107 pixels1 += (half4)bias;
1108 pixels2 += (half4)bias;
1109 pixels3 += (half4)bias;
1110#endif /* defined(HAS_BIAS) */
1111
Giorgio Arenad056e572020-10-12 11:53:51 +01001112 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1113 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1114 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
1115 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001116}
1117
1118/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
1119 * when both stride_x and stride_y are equal to 2
1120 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001121 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001122 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1123 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1124 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001125 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001126 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1127 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001128 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001129 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001130 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001131 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001132 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1133 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001134 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1135 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1136 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1137 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1138 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1139 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1140 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1141 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1142 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1143 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1144 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1145 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1146 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1147 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1148 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1149 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1150 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1151 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1152 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1153 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1154 */
Giorgio Arenadcf4c872021-04-16 12:41:45 +01001155__kernel void depthwise_convolution_3x3_stridex2_stridey2_f16(
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001156 TENSOR3D_DECLARATION(src),
1157 TENSOR3D_DECLARATION(dst),
1158 TENSOR3D_DECLARATION(weights)
1159#if defined(HAS_BIAS)
1160 ,
1161 VECTOR_DECLARATION(biases)
1162#endif //defined(HAS_BIAS)
1163)
1164{
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001165 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001166 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1167
1168 // Extract channel and linearized batch indices
1169 const int channel = get_global_id(2) % DST_CHANNELS;
1170 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001171
1172#ifdef HAS_BIAS
1173 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1174
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001175 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001176#endif /* defined(HAS_BIAS) */
1177
1178 half4 pixels0 = 0.0f;
1179 half4 pixels1 = 0.0f;
1180
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001181 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1182 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena15bc8482020-12-08 14:34:00 +00001183 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
1184 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001185
Usama Arife73686a2019-04-08 17:30:48 +01001186#if(DILATION_X == 1 && DILATION_Y == 1)
1187
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001188 // Load the weights
1189 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1190 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1191 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1192
1193 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
1194 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1195 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1196 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1197 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1198 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1199 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1200 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1201 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1202 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1203 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1204
Giorgio Arenadcf4c872021-04-16 12:41:45 +01001205 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1206 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1207 CONVOLUTION1x3_4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1208 CONVOLUTION1x3_4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1209 CONVOLUTION1x3_4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1210 CONVOLUTION1x3_4X1_STRIDE2(pixels1, src40, src41, weights_row2);
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001211
Usama Arife73686a2019-04-08 17:30:48 +01001212#else /* DILATION_X==1 && DILATION_Y==1 */
1213 //3x3 Convolution of elements starting in 0th row
Giorgio Arenadcf4c872021-04-16 12:41:45 +01001214 pixels0 = convolution_3x3_dilation_stridex2_stridey2_f16(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +01001215 //3x3 Convolution of elements starting in 2nd row
Giorgio Arenadcf4c872021-04-16 12:41:45 +01001216 pixels1 = convolution_3x3_dilation_stridex2_stridey2_f16(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +01001217#endif /* DILATION_X==1 && DILATION_Y==1 */
1218
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001219#ifdef HAS_BIAS
1220 pixels0 += (half4)bias;
1221 pixels1 += (half4)bias;
1222#endif /* defined(HAS_BIAS) */
1223
Giorgio Arenad056e572020-10-12 11:53:51 +01001224 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1225 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001226}
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001227#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Giorgio Arenad051e972018-06-20 11:46:42 +01001228
Giorgio Arena79acd772020-10-22 14:29:50 +01001229#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001230/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
1231 *
1232 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
1233 * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
1234 * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
1235 * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
1236 * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
1237 * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
1238 * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
1239 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1240 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1241 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1242 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
Giorgio Arena79acd772020-10-22 14:29:50 +01001243 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001244 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
1245 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1246 *
1247 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
1248 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1249 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1250 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1251 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1252 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1253 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1254 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1255 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1256 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1257 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1258 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1259 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1260 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1261 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1262 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1263 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1264 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1265 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1266 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1267 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
1268 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1269 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1270 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1271 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1272 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1273 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1274 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1275 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1276 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1277 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1278 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1279 */
1280__kernel void dwc_MxN_native_fp_nhwc(
1281 TENSOR4D_DECLARATION(src),
1282 TENSOR4D_DECLARATION(dst),
Michele Di Giorgio1dce3102019-10-22 10:29:03 +01001283 TENSOR3D_DECLARATION(weights)
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001284#if defined(HAS_BIAS)
Michele Di Giorgio1dce3102019-10-22 10:29:03 +01001285 ,
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001286 VECTOR_DECLARATION(biases)
1287#endif // defined(HAS_BIAS)
1288)
1289{
Giorgio Arena79acd772020-10-22 14:29:50 +01001290 int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0) * sizeof(DATA_TYPE);
1291
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001292 int x = get_global_id(0); // channels
1293 int y = get_global_id(1); // spatial coordinate x
1294#if defined(DST_DEPTH)
1295 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1296 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1297#else // defined(DST_DEPTH)
Giorgio Arena79acd772020-10-22 14:29:50 +01001298 int z = get_global_id(2); // spatial coordinate y
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001299#endif // defined(DST_DEPTH)
1300
Giorgio Arena79acd772020-10-22 14:29:50 +01001301 __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001302
Giorgio Arena79acd772020-10-22 14:29:50 +01001303 __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001304
Giorgio Arena79acd772020-10-22 14:29:50 +01001305 __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001306
1307#if defined(HAS_BIAS)
Giorgio Arena79acd772020-10-22 14:29:50 +01001308 __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001309#endif // defined(HAS_BIAS)
1310
1311#if defined(DST_DEPTH)
1312 s_addr += b * src_stride_w;
1313 d_addr += b * dst_stride_w;
1314#endif // defined(DST_DEPTH)
1315
1316 for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
1317 {
1318 // Each work-item computes N0x1x1 elements
1319 VEC_DATA_TYPE(DATA_TYPE, N0)
Giorgio Arena79acd772020-10-22 14:29:50 +01001320 res0 = 0;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001321
1322 int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
1323 int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1324
1325 for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
1326 {
1327 if(y_coord >= 0 && y_coord < SRC_DIM2)
1328 {
1329 int x_coord_tmp = x_coord;
1330
1331 for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
1332 {
1333 if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
1334 {
1335 int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
1336 int w_offset = xk * weights_stride_y + yk * weights_stride_z;
1337
1338 // Load input and weights values
1339 VEC_DATA_TYPE(DATA_TYPE, N0)
1340 i = VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset));
1341 VEC_DATA_TYPE(DATA_TYPE, N0)
1342 w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset));
1343
1344#if GPU_ARCH == GPU_ARCH_MIDGARD
Giorgio Arena79acd772020-10-22 14:29:50 +01001345 res0 += i * w;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001346#else // GPU_ARCH == GPU_ARCH_MIDGARD
Giorgio Arena79acd772020-10-22 14:29:50 +01001347 res0 = fma(i, w, res0);
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001348#endif // GPU_ARCH == GPU_ARCH_MIDGARD
1349 }
1350 x_coord_tmp += DILATION_X;
1351 }
1352 }
1353 y_coord += DILATION_Y;
1354 }
1355
1356#if defined(HAS_BIAS)
Giorgio Arena79acd772020-10-22 14:29:50 +01001357 res0 += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001358#endif // defined(HAS_BIAS)
1359
Giorgio Arena79acd772020-10-22 14:29:50 +01001360 res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, res0, A_VAL, B_VAL);
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001361
Giorgio Arena79acd772020-10-22 14:29:50 +01001362 STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001363
1364 w_addr += sizeof(DATA_TYPE);
1365 d_addr += sizeof(DATA_TYPE);
1366#if defined(HAS_BIAS)
1367 b_addr += sizeof(DATA_TYPE);
1368#endif // defined(HAS_BIAS)
1369 }
1370}
Giorgio Arena79acd772020-10-22 14:29:50 +01001371#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001372
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001373#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001374
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001375#if DATA_TYPE != float || DATA_TYPE != half
1376#error "Unsupported data type"
1377#endif // DATA_TYPE != float || DATA_TYPE != half
1378
1379#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001380
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001381#define FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond) \
1382 ({ \
1383 basename##0 = select(basename##0, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s0)); \
1384 basename##1 = select(basename##1, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s1)); \
1385 basename##2 = select(basename##2, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s2)); \
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001386 })
1387
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001388#define FILL_ZERO_OUT_OF_BOUND_4(data_type, vec_size, basename, cond) \
1389 ({ \
1390 FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond); \
1391 basename##3 = select(basename##3, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s3)); \
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001392 })
1393
Giorgio Arenad051e972018-06-20 11:46:42 +01001394#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001395
Giorgio Arenad051e972018-06-20 11:46:42 +01001396/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1397 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001398 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001399 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1400 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1401 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1402 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1403 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1404 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001405 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
Usama Arif6a98a6e2019-05-10 17:07:27 +01001406 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001407 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1408 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001409 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
1410 * @note In case of biases, -DHAS_BIAS must to be passed at compile
1411 * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)
Giorgio Arenad051e972018-06-20 11:46:42 +01001412 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001413 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001414 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001415 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001416 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001417 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001418 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001419 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1420 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1421 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1422 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001423 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1424 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1425 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1426 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1427 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1428 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1429 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001430 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1431 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001432 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001433 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001434 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1435 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1436 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1437 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1438 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1439 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1440 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1441 * @param[in] max_offset Max offset for the input tensor
1442 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1443 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1444 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1445 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1446 */
1447__kernel void depthwise_convolution_3x3_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001448 TENSOR4D_DECLARATION(src),
1449 TENSOR4D_DECLARATION(dst),
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001450 TENSOR3D_DECLARATION(weights)
Giorgio Arenad051e972018-06-20 11:46:42 +01001451#if defined(HAS_BIAS)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001452 ,
1453 VECTOR_DECLARATION(biases)
Giorgio Arenad051e972018-06-20 11:46:42 +01001454#endif /* defined(HAS_BIAS) */
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001455)
Giorgio Arenad051e972018-06-20 11:46:42 +01001456{
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001457 int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
1458 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001459#if defined(DST_DEPTH)
1460 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1461 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001462#else // defined(DST_DEPTH)
1463 int z = get_global_id(2); // spatial coordinate y
1464#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001465
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001466 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
Giorgio Arenad051e972018-06-20 11:46:42 +01001467
Georgios Pinitas37044642018-10-30 14:53:25 +00001468#if defined(DST_DEPTH)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001469 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
Georgios Pinitas37044642018-10-30 14:53:25 +00001470#else /* defined(DST_DEPTH) */
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001471 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
Georgios Pinitas37044642018-10-30 14:53:25 +00001472#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001473
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001474 int3 src_coord_y = (int3)(y * CONV_STRIDE_X - CONV_PAD_LEFT) + (int3)(0, DILATION_X, 2 * DILATION_X);
1475 int3 src_coord_z = (int3)(z * CONV_STRIDE_Y - CONV_PAD_TOP) + (int3)(0, DILATION_Y, 2 * DILATION_Y);
Giorgio Arenad051e972018-06-20 11:46:42 +01001476
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001477 int3 src_offset_y = clamp(src_coord_y, (int3)0, (int3)(SRC_DIM_1 - 1));
1478 int3 src_offset_z = clamp(src_coord_z, (int3)0, (int3)(SRC_DIM_2 - 1));
1479
1480 // Use these vectors to check whether the unclamped load would have been out of bounds
1481 src_coord_y = (src_offset_y != src_coord_y);
1482 src_coord_z = (src_offset_z != src_coord_z);
1483
1484 src_offset_y *= (int3)src_stride_y;
1485 src_offset_z *= (int3)src_stride_z;
1486
1487 // We compute VEC_SIZEx1x1 [C,W,H] elements
1488 VEC_FLOAT acc0 = 0;
Giorgio Arenad051e972018-06-20 11:46:42 +01001489
1490 // Load weights
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001491 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));
1492 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));
1493 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));
1494 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));
1495 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));
1496 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));
1497 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));
1498 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));
1499 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001500
1501 // Load input values
1502 // z == 0
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001503 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));
1504 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));
1505 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001506
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001507 FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s0);
1508
1509 acc0 = fma(values0, w0, acc0);
1510 acc0 = fma(values1, w1, acc0);
1511 acc0 = fma(values2, w2, acc0);
Giorgio Arenad051e972018-06-20 11:46:42 +01001512
1513 // z == 1
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001514 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
1515 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
1516 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
1517
1518 FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s1);
1519
1520 acc0 = fma(values0, w3, acc0);
1521 acc0 = fma(values1, w4, acc0);
1522 acc0 = fma(values2, w5, acc0);
Giorgio Arenad051e972018-06-20 11:46:42 +01001523
1524 // z == 2
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001525 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
1526 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
1527 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001528
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001529 FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s2);
Giorgio Arenad051e972018-06-20 11:46:42 +01001530
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001531 acc0 = fma(values0, w6, acc0);
1532 acc0 = fma(values1, w7, acc0);
1533 acc0 = fma(values2, w8, acc0);
Giorgio Arenad051e972018-06-20 11:46:42 +01001534
1535#if defined(HAS_BIAS)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001536 __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
1537 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);
1538 acc0 += bias_values;
Giorgio Arenad051e972018-06-20 11:46:42 +01001539#endif // defined(HAS_BIAS)
1540
Georgios Pinitas37044642018-10-30 14:53:25 +00001541#if defined(DST_DEPTH)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001542 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
Georgios Pinitas37044642018-10-30 14:53:25 +00001543#else /* defined(DST_DEPTH) */
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001544 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001545#endif /* defined(DST_DEPTH) */
1546
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001547 acc0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);
1548 STORE_VECTOR_SELECT(acc, DATA_TYPE, dst_addr, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
Giorgio Arenad051e972018-06-20 11:46:42 +01001549}
1550#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1551
1552#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1553/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1554 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001555 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001556 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1557 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1558 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1559 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1560 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1561 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001562 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001563 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1564 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001565 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
1566 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
1567 * @note The size of the output's second dimension must be passed at compile time using -DDST_DIM_1 (e.g. -DDST_DIM_1=64)
1568 * @note The size of the output's third dimension must be passed at compile time using -DDST_DIM_2 (e.g. -DDST_DIM_2=32)
1569 * @note In case of biases, -DHAS_BIAS must to be passed at compile
1570 * @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)
Giorgio Arenad051e972018-06-20 11:46:42 +01001571 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001572 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001573 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001574 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001575 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001576 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001577 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001578 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1579 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1580 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1581 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001582 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1583 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1584 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1585 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1586 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1587 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1588 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001589 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1590 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001591 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001592 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001593 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1594 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1595 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1596 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1597 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1598 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1599 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1600 * @param[in] max_offset Max offset for the input tensor
1601 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1602 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1603 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1604 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1605 */
1606__kernel void depthwise_convolution_3x3_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +00001607 TENSOR4D_DECLARATION(src),
1608 TENSOR4D_DECLARATION(dst),
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001609 TENSOR3D_DECLARATION(weights)
Giorgio Arenad051e972018-06-20 11:46:42 +01001610#if defined(HAS_BIAS)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001611 ,
1612 VECTOR_DECLARATION(biases)
Giorgio Arenad051e972018-06-20 11:46:42 +01001613#endif /* defined(HAS_BIAS) */
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001614)
Giorgio Arenad051e972018-06-20 11:46:42 +01001615{
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001616 int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
1617 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001618#if defined(DST_DEPTH)
1619 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1620 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001621#else // defined(DST_DEPTH)
1622 int z = get_global_id(2); // spatial coordinate y
1623#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001624
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001625 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
Giorgio Arenad051e972018-06-20 11:46:42 +01001626
Georgios Pinitas37044642018-10-30 14:53:25 +00001627#if defined(DST_DEPTH)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001628 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
Georgios Pinitas37044642018-10-30 14:53:25 +00001629#else /* defined(DST_DEPTH) */
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001630 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
Georgios Pinitas37044642018-10-30 14:53:25 +00001631#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001632
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001633 int4 src_coord_y = (int4)(y * NUM_ROWS_PROCESSED - CONV_PAD_LEFT) + V_OFFS4(int);
1634 int4 src_coord_z = (int4)(z * NUM_PLANES_PROCESSED - CONV_PAD_TOP) + V_OFFS4(int);
Giorgio Arenad051e972018-06-20 11:46:42 +01001635
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001636 int4 src_offset_y = clamp(src_coord_y, (int4)0, (int4)(SRC_DIM_1 - 1));
1637 int4 src_offset_z = clamp(src_coord_z, (int4)0, (int4)(SRC_DIM_2 - 1));
1638
1639 // Use these vectors to check whether the unclamped load would have been out of bounds
1640 src_coord_y = (src_offset_y != src_coord_y);
1641 src_coord_z = (src_offset_z != src_coord_z);
1642
1643 src_offset_y *= (int4)src_stride_y;
1644 src_offset_z *= (int4)src_stride_z;
1645
1646 // We compute VEC_SIZEx2x2 [C,W,H] elements
Giorgio Arenad051e972018-06-20 11:46:42 +01001647 VEC_FLOAT acc0 = 0;
1648 VEC_FLOAT acc1 = 0;
1649 VEC_FLOAT acc2 = 0;
1650 VEC_FLOAT acc3 = 0;
1651
1652 // Load weights
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001653 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));
1654 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));
1655 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));
1656 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));
1657 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));
1658 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));
1659 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));
1660 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));
1661 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001662
1663 // Load input values
1664 // z == 0
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001665 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));
1666 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));
1667 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));
1668 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001669
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001670 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s0);
Giorgio Arenad051e972018-06-20 11:46:42 +01001671
1672 acc0 = fma(values0, w0, acc0);
1673 acc0 = fma(values1, w1, acc0);
1674 acc0 = fma(values2, w2, acc0);
1675 acc1 = fma(values1, w0, acc1);
1676 acc1 = fma(values2, w1, acc1);
1677 acc1 = fma(values3, w2, acc1);
1678
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001679 // z == 1
1680 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
1681 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
1682 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
1683 values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001684
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001685 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s1);
Giorgio Arenad051e972018-06-20 11:46:42 +01001686
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001687 acc0 = fma(values0, w3, acc0);
1688 acc0 = fma(values1, w4, acc0);
1689 acc0 = fma(values2, w5, acc0);
1690 acc1 = fma(values1, w3, acc1);
1691 acc1 = fma(values2, w4, acc1);
1692 acc1 = fma(values3, w5, acc1);
Giorgio Arenad051e972018-06-20 11:46:42 +01001693
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001694 acc2 = fma(values0, w0, acc2);
1695 acc2 = fma(values1, w1, acc2);
1696 acc2 = fma(values2, w2, acc2);
1697 acc3 = fma(values1, w0, acc3);
1698 acc3 = fma(values2, w1, acc3);
1699 acc3 = fma(values3, w2, acc3);
Giorgio Arenad051e972018-06-20 11:46:42 +01001700
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001701 // z == 2
1702 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
1703 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
1704 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
1705 values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s3));
1706
1707 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s2);
1708
1709 acc0 = fma(values0, w6, acc0);
1710 acc0 = fma(values1, w7, acc0);
1711 acc0 = fma(values2, w8, acc0);
1712 acc1 = fma(values1, w6, acc1);
1713 acc1 = fma(values2, w7, acc1);
1714 acc1 = fma(values3, w8, acc1);
1715
1716 acc2 = fma(values0, w3, acc2);
1717 acc2 = fma(values1, w4, acc2);
1718 acc2 = fma(values2, w5, acc2);
1719 acc3 = fma(values1, w3, acc3);
1720 acc3 = fma(values2, w4, acc3);
1721 acc3 = fma(values3, w5, acc3);
1722
1723 // z == 3
1724 values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s0));
1725 values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s1));
1726 values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s2));
1727 values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s3));
1728
1729 FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s3);
1730
1731 acc2 = fma(values0, w6, acc2);
1732 acc2 = fma(values1, w7, acc2);
1733 acc2 = fma(values2, w8, acc2);
1734 acc3 = fma(values1, w6, acc3);
1735 acc3 = fma(values2, w7, acc3);
1736 acc3 = fma(values3, w8, acc3);
Giorgio Arenad051e972018-06-20 11:46:42 +01001737
1738#if defined(HAS_BIAS)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001739 __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
Giorgio Arenad051e972018-06-20 11:46:42 +01001740
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001741 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001742
1743 acc0 += bias_values;
1744 acc1 += bias_values;
1745 acc2 += bias_values;
1746 acc3 += bias_values;
1747#endif // defined(HAS_BIAS)
1748
Giorgio Arena2d1a8352020-10-26 15:04:08 +00001749 int2 dst_offset_y = min((int2)(y * NUM_ROWS_PROCESSED) + V_OFFS2(int), (int2)(DST_DIM_1 - 1)) * (int2)dst_stride_y;
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001750 int dst_coord_z = z * NUM_PLANES_PROCESSED;
Giorgio Arenad051e972018-06-20 11:46:42 +01001751
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001752#if defined(DST_DEPTH)
1753 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z + b * dst_stride_w;
1754#else // defined(DST_DEPTH)
1755 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z;
1756#endif // defined(DST_DEPTH)
1757
1758 /* Store vectors in reverse order along the Y. The Y offsets are calculated so that they are forced to be in bound.
1759 * If only the first address is in bound, the Y offset of the second address will be brought back and there will be 2 writes in the same location for the same thread.
1760 * Since the last vector to be written is always the valid one for that location, it overwrites the wrong values.
1761 */
1762 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc1, A_VAL, B_VAL);
1763 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
1764
1765 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);
1766 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
Giorgio Arenad051e972018-06-20 11:46:42 +01001767
1768#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001769 if((dst_coord_z + 1) < DST_DIM_2)
Giorgio Arenad051e972018-06-20 11:46:42 +01001770#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1771 {
Giorgio Arena1e2af2a2020-10-15 17:39:41 +01001772 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc3, A_VAL, B_VAL);
1773 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
1774
1775 values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc2, A_VAL, B_VAL);
1776 STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)
Giorgio Arenad051e972018-06-20 11:46:42 +01001777 }
1778}
1779
1780#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +01001781#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)