blob: fb4a0fc157b79616a356562109ccf5f576f9dfdd [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
giuros016d109962019-01-07 17:47:19 +00002 * Copyright (c) 2017-2019 ARM Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26
Usama Arif6a98a6e2019-05-10 17:07:27 +010027#include "activation_float_helpers.h"
Manuel Bottinia788c2f2019-04-08 13:18:00 +010028
29/** Get the pointer position at a certain offset in x and y direction.
30 *
31 * @param[in] ptr Pointer to the starting position of the buffer
32 * @param[in] x Relative X position
33 * @param[in] y Relative Y position
34 * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
35 * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
36 *
37 * @return a uchar
38 */
39inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
40{
41 return ptr + x * stride_x + y * stride_y;
42}
43
44#if(DILATION_X == 1 && DILATION_Y == 1)
45
46#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
47 ({ \
48 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
49 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
50 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
51 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
52 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
53 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
54 })
55
56#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
57 ({ \
58 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
59 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
60 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
61 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
62 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
63 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
64 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
65 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
66 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
67 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
68 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
69 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
70 })
71
72#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
73 ({ \
74 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
75 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
76 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
77 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
78 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
79 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
80 })
81
82#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
83 ({ \
84 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
85 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
86 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
87 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
88 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
89 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
90 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
91 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
92 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
93 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
94 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
95 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
96 })
97
98#else /* DILATION_X==1 && DILATION_Y==1 */
99
100#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
101 ({ \
102 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
103 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
104 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
105 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
106 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
107 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
108 })
109
110#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
111 ({ \
112 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
113 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
114 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
115 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
116 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
117 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
118 })
119
120#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
121 ({ \
122 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
123 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
124 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
125 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
126 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
127 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
128 acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
129 acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
130 acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
131 acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
132 acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
133 acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
134 })
135
136#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
137 ({ \
138 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
139 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
140 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
141 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
142 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
143 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
144 acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
145 acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
146 acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
147 acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
148 acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
149 acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
150 })
151
152#endif /* DILATION_X==1 && DILATION_Y==1 */
153
154#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100155#if defined(CONV_STRIDE_X)
156
Giorgio Arena93a690e2017-08-01 16:09:33 +0100157#if CONV_STRIDE_X == 1
158#define convolution1x3 convolution1x3_stride_1
159#elif CONV_STRIDE_X == 2
160#define convolution1x3 convolution1x3_stride_2
161#elif CONV_STRIDE_X == 3
162#define convolution1x3 convolution1x3_stride_3
163#else /* CONV_STRIDE_X */
164#error "Stride not supported"
165#endif /* CONV_STRIDE_X */
166
167/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
168 *
169 * @param[in] left_pixel Pointer to the left pixel.
170 * @param[in] left_coeff Weight of the left pixel
171 * @param[in] middle_coeff Weight of the middle pixel
172 * @param[in] right_coeff Weight of the right pixel
173 *
174 * @return a float2 containing 2 convoluted values.
175 */
176inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
177 const float left_coeff,
178 const float middle_coeff,
179 const float right_coeff)
180{
Usama Arife73686a2019-04-08 17:30:48 +0100181#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100182 float4 temp = vload4(0, (__global float *)left_pixel);
183
184 float2 left = CONVERT(temp.s01, float2);
185 float2 middle = CONVERT(temp.s12, float2);
186 float2 right = CONVERT(temp.s23, float2);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100187 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100188#else /* DILATION_X==1 && DILATION_Y==1 */
189 return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
190 + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
191 + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
192#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100193}
194
195/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
196 *
197 * @param[in] left_pixel Pointer to the left pixel.
198 * @param[in] left_coeff Weight of the left pixel
199 * @param[in] middle_coeff Weight of the middle pixel
200 * @param[in] right_coeff Weight of the right pixel
201 *
202 * @return a float2 containing 2 convoluted values.
203 */
204inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
205 const float left_coeff,
206 const float middle_coeff,
207 const float right_coeff)
208{
Usama Arife73686a2019-04-08 17:30:48 +0100209#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100210 float4 temp0 = vload4(0, (__global float *)left_pixel);
211 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
212
213 float2 left = CONVERT(temp0.s02, float2);
214 float2 middle = CONVERT(temp0.s13, float2);
215 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
216
217 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100218#else /* DILATION_X==1 && DILATION_Y==1 */
219 __global float *left_pixel_float = (__global float *)left_pixel;
220
221 return vload4(0, left_pixel_float).s02 * (float2)left_coeff
222 + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
223 + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
224
225#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100226}
227
228/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
229 *
230 * @param[in] left_pixel Pointer to the left pixel.
231 * @param[in] left_coeff Weight of the left pixel
232 * @param[in] middle_coeff Weight of the middle pixel
233 * @param[in] right_coeff Weight of the right pixel
234 *
235 * @return a float2 containing 2 convoluted values.
236 */
237inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
238 const float left_coeff,
239 const float middle_coeff,
240 const float right_coeff)
241{
Usama Arife73686a2019-04-08 17:30:48 +0100242#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100243 float4 temp0 = vload4(0, (__global float *)left_pixel);
244 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
245
246 float2 left = CONVERT(temp0.s03, float2);
247 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
248 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
249
250 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100251#else /* DILATION_X==1 && DILATION_Y==1 */
252 __global float *left_pixel_float = (__global float *)left_pixel;
253
254 return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
255 + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
256 + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
257#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100258}
259
260/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
261 *
262 * Convolution matrix layout:
263 *
264 * [ mat0, mat1, mat2 ]\n
265 * [ mat3, mat4, mat5 ]\n
266 * [ mat6, mat7, mat8 ]\n
267 *
268 * @param[in] src A pointer to source Image structure
269 * @param[in] mat0 Coefficient from the convolution matrix
270 * @param[in] mat1 Coefficient from the convolution matrix
271 * @param[in] mat2 Coefficient from the convolution matrix
272 * @param[in] mat3 Coefficient from the convolution matrix
273 * @param[in] mat4 Coefficient from the convolution matrix
274 * @param[in] mat5 Coefficient from the convolution matrix
275 * @param[in] mat6 Coefficient from the convolution matrix
276 * @param[in] mat0 Coefficient from the convolution matrix
277 * @param[in] mat7 Coefficient from the convolution matrix
278 * @param[in] mat8 Coefficient from the convolution matrix
279 *
280 * @return a float2 containing 2 convoluted values.
281 */
282inline float2 convolution3x3(
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100283 __global const uchar *src,
284 unsigned int src_stride_y,
Giorgio Arena93a690e2017-08-01 16:09:33 +0100285 const float mat0, const float mat1, const float mat2,
286 const float mat3, const float mat4, const float mat5,
287 const float mat6, const float mat7, const float mat8)
288{
289 float2 pixels;
290
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100291 pixels = convolution1x3((src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
292 pixels += convolution1x3((src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
293 pixels += convolution1x3((src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100294
295 return pixels;
296}
297
Gian Marcoc799ed82018-02-01 16:57:48 +0000298/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000299 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100300 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
301 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
302 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000303 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
304 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000305 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000306 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000307 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000308 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000309 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
310 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000311 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000312 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
313 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
314 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
315 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
316 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
317 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
318 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000319 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000320 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
321 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
322 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
323 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
324 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
325 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
326 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
327 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
328 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
329 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
330 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
331 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100332__kernel void depthwise_convolution_3x3(
333 TENSOR3D_DECLARATION(src),
334 TENSOR3D_DECLARATION(dst),
335 TENSOR3D_DECLARATION(weights)
336#if defined(HAS_BIAS)
337 ,
338 VECTOR_DECLARATION(biases)
339#endif //defined(HAS_BIAS)
340)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100341{
342 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
343 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100344 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100345
346 float2 pixels = 0.0f;
Giorgio Arena93a690e2017-08-01 16:09:33 +0100347
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100348 // Extract channel and linearized batch indices
349 const int channel = get_global_id(2) % DST_CHANNELS;
350 const int batch = get_global_id(2) / DST_CHANNELS;
351 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100352
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100353 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100354
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100355 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Giorgio Arena93a690e2017-08-01 16:09:33 +0100356
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100357 // Load the weights
358 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
359 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
360 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
361
362 pixels = convolution3x3(src_addr, src_stride_y,
363 weights_values0.s0, weights_values0.s1, weights_values0.s2,
364 weights_values1.s0, weights_values1.s1, weights_values1.s2,
365 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100366#if defined(HAS_BIAS)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100367 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
368
369 float bias = *((__global float *)(vector_offset(&biases, channel)));
370
371 pixels += (float2)bias;
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100372#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100373
Usama Arif6a98a6e2019-05-10 17:07:27 +0100374 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100375}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100376#endif //defined(CONV_STRIDE_X)
377
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100378#if(DILATION_X > 1 || DILATION_Y > 1)
Usama Arife73686a2019-04-08 17:30:48 +0100379
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100380/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100381 *
382 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
383 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
384 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
385 * @param[in] y_offset Offset from the source tensor from which to start convolution
386 * @param[in] weights_addr Pointer from where to get weights
387 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
388 */
389inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
390 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
391{
392 // Load the weights
393 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
394 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
395 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
396
397 float2 pixels0 = 0.0f;
398
399 float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
400 float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
401 float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
402
403 float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
404 float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
405 float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
406
407 float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
408 float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
409 float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
410
411 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
412 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
413 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
414
415 return pixels0;
416}
417
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100418/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100419 *
420 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
421 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
422 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
423 * @param[in] y_offset Offset from the source tensor from which to start convolution
424 * @param[in] weights_addr Pointer from where to get weights
425 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
426 */
427inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
428 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
429{
430 // Load the weights
431 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
432 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
433 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
434
435 float2 pixels0 = 0.0f;
436
437 float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
438 float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
439 float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
440
441 float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
442 float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
443 float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
444
445 float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
446 float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
447 float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
448
449 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
450 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
451 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
452
453 return pixels0;
454}
455
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100456#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
Usama Arife73686a2019-04-08 17:30:48 +0100457
Gian Marcoc799ed82018-02-01 16:57:48 +0000458/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
459 * stride_x and stride_y are equal to 1
460 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100461 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100462 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
463 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
464 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100465 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000466 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
467 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000468 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000469 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000470 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000471 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000472 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
473 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
474 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
475 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
476 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
477 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
478 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
479 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
480 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
481 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
482 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
483 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
484 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
485 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
486 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
487 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
488 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
489 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
490 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
491 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
492 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
493 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
494 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000495__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000496 TENSOR3D_DECLARATION(src),
497 TENSOR3D_DECLARATION(dst),
498 TENSOR3D_DECLARATION(weights)
499#if defined(HAS_BIAS)
500 ,
501 VECTOR_DECLARATION(biases)
502#endif //defined(HAS_BIAS)
503)
504{
505 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
506 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100507 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000508
509 float2 pixels0 = 0.0f;
510 float2 pixels1 = 0.0f;
511 float2 pixels2 = 0.0f;
512 float2 pixels3 = 0.0f;
513
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100514 // Extract channel and linearized batch indices
515 const int channel = get_global_id(2) % DST_CHANNELS;
516 const int batch = get_global_id(2) / DST_CHANNELS;
517 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
518 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
519 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000520
Usama Arife73686a2019-04-08 17:30:48 +0100521#if(DILATION_X == 1 && DILATION_Y == 1)
Gian Marcoc799ed82018-02-01 16:57:48 +0000522 // Load the weights
523 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
524 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
525 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
526
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000527 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000528 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
529 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
530 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
531 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000532 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
533 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000534
535 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
536 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
537 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
538 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
539 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
540 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
541 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
542 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
543 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
544 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
545 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
546 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
547
Usama Arife73686a2019-04-08 17:30:48 +0100548#else /* DILATION_X==1 && DILATION_Y==1 */
549
550 //3x3 Convolution of elements starting in 0th row
551 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
552 //3x3 Convolution of elements starting in 1st row
553 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
554 //3x3 Convolution of elements starting in 2nd row
555 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
556 //3x3 Convolution of elements starting in 3rd row
557 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
558
559#endif /* DILATION_X==1 && DILATION_Y==1 */
560
Gian Marcoc799ed82018-02-01 16:57:48 +0000561#ifdef HAS_BIAS
562 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
563
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100564 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000565
566 pixels0 += (float2)bias;
567 pixels1 += (float2)bias;
568 pixels2 += (float2)bias;
569 pixels3 += (float2)bias;
570#endif /* defined(HAS_BIAS) */
571
Usama Arif6a98a6e2019-05-10 17:07:27 +0100572 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
573 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
574 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
575 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000576}
577
578/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
579 * stride_x and stride_y are equal to 2
580 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100581 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100582 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
583 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
584 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100585 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000586 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
587 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000588 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000589 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000590 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000591 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000592 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
593 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
594 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
595 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
596 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
597 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
598 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
599 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
600 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
601 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
602 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
603 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
604 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
605 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
606 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
607 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
608 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
609 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
610 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
611 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
612 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
613 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
614 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000615__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000616 TENSOR3D_DECLARATION(src),
617 TENSOR3D_DECLARATION(dst),
618 TENSOR3D_DECLARATION(weights)
619#if defined(HAS_BIAS)
620 ,
621 VECTOR_DECLARATION(biases)
622#endif //defined(HAS_BIAS)
623)
624{
625 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
626 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100627 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000628
629 float2 pixels0 = 0.0f;
630 float2 pixels1 = 0.0f;
631
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100632 // Extract channel and linearized batch indices
Georgios Pinitas37044642018-10-30 14:53:25 +0000633 const int channel = get_global_id(2) % DST_CHANNELS;
634 const int batch = get_global_id(2) / DST_CHANNELS;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100635 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
636 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
637 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000638
Usama Arife73686a2019-04-08 17:30:48 +0100639#if(DILATION_X == 1 && DILATION_Y == 1)
640
Gian Marcoc799ed82018-02-01 16:57:48 +0000641 // Load the weights
642 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
643 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
644 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
645
646 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
647 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
648 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
649 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
650 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
651 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
652 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
653 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
654 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
655 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
656 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
657
658 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
659 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
660 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
661 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
662 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
663 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
664
Usama Arife73686a2019-04-08 17:30:48 +0100665#else /* DILATION_X==1 && DILATION_Y==1 */
666
667 //3x3 Convolution of elements starting in 0th row
668 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
669 //3x3 Convolution of elements starting in 2nd row
670 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
671#endif /* DILATION_X==1 && DILATION_Y==1 */
672
Gian Marcoc799ed82018-02-01 16:57:48 +0000673#ifdef HAS_BIAS
674 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
675
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100676 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000677
678 pixels0 += (float2)bias;
679 pixels1 += (float2)bias;
680#endif /* defined(HAS_BIAS) */
681
Usama Arif6a98a6e2019-05-10 17:07:27 +0100682 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
683 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000684}
685
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100686#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena76572242018-04-04 17:44:26 +0100687
giuros016d109962019-01-07 17:47:19 +0000688#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
689/** Reshape the weights for quantized depthwise convolution
690 *
691 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
692 * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
693 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
694 * @attention Input's height and width should be 3
695 *
696 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
697 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
698 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
699 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
700 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
701 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
702 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
703 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
704 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
705 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
706 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
707 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
708 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
709 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
710 */
711__kernel void depthwise_convolution_reshape_weights(
712 TENSOR3D_DECLARATION(src),
713 IMAGE_DECLARATION(dst))
714{
715 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
716 const int x = get_global_id(0);
717
718 // Load 3x3xVEC_SIZE weights
719 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
720 w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
721 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
722 w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
723 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
724 w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
725 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
726 w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
727 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
728 w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
729 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
730 w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
731 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
732 w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
733 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
734 w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
735 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
736 w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
737
738 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
739
740#if defined(TRANSPOSE)
741#if VEC_SIZE != 4
742#error "VEC_SIZE not supported"
743#else // VEC_SIZE != 4
744 VSTORE(VEC_SIZE)
745 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
746 VSTORE(VEC_SIZE)
747 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
748 VSTORE(VEC_SIZE)
749 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
750 VSTORE(VEC_SIZE)
751 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
752 VSTORE(VEC_SIZE)
753 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
754 VSTORE(VEC_SIZE)
755 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
756 VSTORE(VEC_SIZE)
757 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
758 VSTORE(VEC_SIZE)
759 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
760 VSTORE(VEC_SIZE)
761 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
762#endif // VEC_SIZE != 4
763#else // !defined(TRANSPOSE)
764 VSTORE(VEC_SIZE)
765 (w0, 0, dst_addr + 0);
766 VSTORE(VEC_SIZE)
767 (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
768 VSTORE(VEC_SIZE)
769 (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
770 VSTORE(VEC_SIZE)
771 (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
772 VSTORE(VEC_SIZE)
773 (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
774 VSTORE(VEC_SIZE)
775 (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
776 VSTORE(VEC_SIZE)
777 (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
778 VSTORE(VEC_SIZE)
779 (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
780 VSTORE(VEC_SIZE)
781 (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
782#endif // defined(TRANSPOSE)
783}
784#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
785
Giorgio Arenad051e972018-06-20 11:46:42 +0100786#if defined(NCHW)
787#define in_stride_x src_stride_x
788#define in_stride_y src_stride_y
789#define in_stride_z src_stride_z
790#define out_stride_x dst_stride_x
791#define out_stride_y dst_stride_y
792#define out_stride_z dst_stride_z
793#else //defined(NCHW)
794#define in_stride_x src_stride_y
795#define in_stride_y src_stride_z
796#define in_stride_z src_stride_x
797#define out_stride_x dst_stride_y
798#define out_stride_y dst_stride_z
799#define out_stride_z dst_stride_x
800#endif //defined(NCHW)
801
Giorgio Arena9fe41442017-08-23 16:36:24 +0100802#if defined(SRC_WIDTH) && defined(DATA_TYPE)
803/** This kernel reshapes each of the tensor's low three dimensions to single rows.
804 *
805 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
806 *
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100807 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
808 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
809 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
810 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
811 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
812 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
813 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
814 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
815 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
816 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
817 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
818 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
819 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
820 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
821 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
822 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
823 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
824 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Giorgio Arena9fe41442017-08-23 16:36:24 +0100825 */
giuros016d109962019-01-07 17:47:19 +0000826__kernel void depthwise_convolution_reshape_weights_generic(
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100827 TENSOR3D_DECLARATION(src),
828 IMAGE_DECLARATION(dst)
829#ifdef HAS_BIAS
830 ,
831 VECTOR_DECLARATION(biases)
832#endif /* HAS_BIAS */
833)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100834{
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100835#ifdef HAS_BIAS
836 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
837#endif /* HAS_BIAS */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100838
Giorgio Arenad051e972018-06-20 11:46:42 +0100839 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
840 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100841
Giorgio Arenad051e972018-06-20 11:46:42 +0100842 for(int i = 0; i < SRC_WIDTH; ++i, input_ptr += in_stride_x)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100843 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100844 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *((__global DATA_TYPE *)input_ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100845 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100846
847#if defined(HAS_BIAS)
848 if(get_global_id(1) == 0)
849 {
Michele Di Giorgiod24af8a2018-05-08 17:23:52 +0100850 *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100851 }
852#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100853}
854#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
855
Usama Arife73686a2019-04-08 17:30:48 +0100856#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER) && defined(DILATION_X) && defined(DILATION_Y)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100857/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
858 *
859 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Giorgio Arena76572242018-04-04 17:44:26 +0100860 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
Usama Arife73686a2019-04-08 17:30:48 +0100861 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
Giorgio Arena9fe41442017-08-23 16:36:24 +0100862 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100863 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100864 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
865 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
866 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
867 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
868 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
869 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
870 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
871 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
872 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
873 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
874 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
875 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
876 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
877 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
878 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
879 */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100880__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
881{
882 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
883
884 const int src_pixel_linear = get_global_id(1) * STRIDE_X;
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100885 const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
Usama Arife73686a2019-04-08 17:30:48 +0100886 const int max_initial_x = STRIDE_X * (((full_length - (KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1))) / STRIDE_X) + 1);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100887
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100888 const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
889 const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
Giorgio Arena76572242018-04-04 17:44:26 +0100890 const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100891
Giorgio Arenad051e972018-06-20 11:46:42 +0100892 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100893 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
894
Usama Arife73686a2019-04-08 17:30:48 +0100895 for(int y = src_y; y < src_y + KERNEL_HEIGHT + (KERNEL_HEIGHT - 1) * (DILATION_Y - 1); y += DILATION_Y)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100896 {
Usama Arife73686a2019-04-08 17:30:48 +0100897 for(int x = src_x; x < src_x + KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1); x += DILATION_X, ++output_ptr)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100898 {
899 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
900 {
Georgios Pinitasde5a1cc2018-02-02 12:52:07 +0000901 *output_ptr = PAD_VALUE;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100902 }
903 else
904 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100905 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * in_stride_x + y * in_stride_y));
Giorgio Arena9fe41442017-08-23 16:36:24 +0100906 }
907 }
908 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100909#if defined(HAS_BIAS)
910 *output_ptr = (DATA_TYPE)(1);
911#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100912}
913
Giorgio Arena76572242018-04-04 17:44:26 +0100914#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100915
916#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
917
918/** This kernel performs a reshaping of the output of the depthwise generic convolution.
919 *
920 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
921 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
922 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100923 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100924 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
925 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
926 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
927 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
928 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
929 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
930 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
931 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
932 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
933 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
934 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
935 */
936__kernel void depthwise_vector_to_tensor(
937 VECTOR_DECLARATION(src),
938 TENSOR3D_DECLARATION(dst))
939{
940 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
941
942 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
943 const int id0 = get_global_id(0);
944 const int z = id0 / patch_size;
945 const int index2D = id0 - z * patch_size;
946
Giorgio Arenad051e972018-06-20 11:46:42 +0100947 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * out_stride_x + index2D / CONV_WIDTH * out_stride_y + z * out_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100948 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
949}
950
951#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000952
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100953#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000954#if defined(CONV_STRIDE_X)
955#if CONV_STRIDE_X == 1
956#define convolution1x3_f16 convolution1x3_stride_1_f16
957#elif CONV_STRIDE_X == 2
958#define convolution1x3_f16 convolution1x3_stride_2_f16
959#elif CONV_STRIDE_X == 3
960#define convolution1x3_f16 convolution1x3_stride_3_f16
961#else /* CONV_STRIDE_X */
962#error "Stride not supported"
963#endif /* CONV_STRIDE_X */
964
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100965#if(DILATION_X > 1 || DILATION_Y > 1)
966
967/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16
968 *
969 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
970 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
971 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
972 * @param[in] y_offset Offset from the source tensor from which to start convolution
973 * @param[in] weights_addr Pointer from where to get weights
974 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
975 */
976inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
977 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
978{
979 // Load the weights
980 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
981 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
982 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
983
984 half4 pixels0 = 0.0f;
985
986 half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
987 half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
988 half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
989
990 half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
991 half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
992 half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
993
994 half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
995 half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
996 half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
997
998 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
999 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
1000 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
1001
1002 return pixels0;
1003}
1004
1005/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
1006 *
1007 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
1008 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1009 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1010 * @param[in] y_offset Offset from the source tensor from which to start convolution
1011 * @param[in] weights_addr Pointer from where to get weights
1012 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
1013 */
1014inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
1015 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
1016{
1017 // Load the weights
1018 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1019 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1020 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1021
1022 half4 pixels0 = 0.0f;
1023
1024 half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
1025 half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
1026 half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
1027
1028 half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
1029 half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
1030 half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
1031
1032 half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
1033 half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
1034 half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
1035
1036 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
1037 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
1038 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
1039
1040 return pixels0;
1041}
1042
1043#endif // (DILATION_X > 1 && DILATION_Y > 1)
1044
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001045/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
1046 *
1047 * @param[in] left_pixel Pointer to the left pixel.
1048 * @param[in] left_coeff Weight of the left pixel
1049 * @param[in] middle_coeff Weight of the middle pixel
1050 * @param[in] right_coeff Weight of the right pixel
1051 *
1052 * @return a half4 containing 4 convoluted values.
1053 */
1054inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
1055 const half left_coeff,
1056 const half middle_coeff,
1057 const half right_coeff)
1058{
Usama Arife73686a2019-04-08 17:30:48 +01001059#if(DILATION_X == 1 && DILATION_Y == 1)
1060
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001061 half8 temp = vload8(0, (__global half *)left_pixel);
1062
1063 half4 left = CONVERT(temp.s0123, half4);
1064 half4 middle = CONVERT(temp.s1234, half4);
1065 half4 right = CONVERT(temp.s2345, half4);
1066
1067 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001068#else /* DILATION_X==1 && DILATION_Y==1 */
1069 return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
1070 + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
1071 + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
1072
1073#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001074}
1075
1076/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
1077 *
1078 * @param[in] left_pixel Pointer to the left pixel.
1079 * @param[in] left_coeff Weight of the left pixel
1080 * @param[in] middle_coeff Weight of the middle pixel
1081 * @param[in] right_coeff Weight of the right pixel
1082 *
1083 * @return a half4 containing 4 convoluted values.
1084 */
1085inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
1086 const half left_coeff,
1087 const half middle_coeff,
1088 const half right_coeff)
1089{
Usama Arife73686a2019-04-08 17:30:48 +01001090#if(DILATION_X == 1 && DILATION_Y == 1)
1091
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001092 half8 temp0 = vload8(0, (__global half *)left_pixel);
1093 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
1094
1095 half4 left = CONVERT(temp0.s0246, half4);
1096 half4 middle = CONVERT(temp0.s1357, half4);
1097 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
1098
1099 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001100#else /* DILATION_X==1 && DILATION_Y==1 */
1101
1102 __global half *left_pixel_float = (__global half *)left_pixel;
1103
1104 return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
1105 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
1106 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
1107
1108#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001109}
1110
1111/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
1112 *
1113 * @param[in] left_pixel Pointer to the left pixel.
1114 * @param[in] left_coeff Weight of the left pixel
1115 * @param[in] middle_coeff Weight of the middle pixel
1116 * @param[in] right_coeff Weight of the right pixel
1117 *
1118 * @return a half4 containing 4 convoluted values.
1119 */
1120inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
1121 const half left_coeff,
1122 const half middle_coeff,
1123 const half right_coeff)
1124{
Usama Arife73686a2019-04-08 17:30:48 +01001125#if(DILATION_X == 1 && DILATION_Y == 1)
1126
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001127 half16 temp0 = vload16(0, (__global half *)left_pixel);
1128
1129 half4 left = CONVERT(temp0.s0369, half4);
1130 half4 middle = CONVERT(temp0.s147A, half4);
1131 half4 right = CONVERT(temp0.s258B, half4);
1132
1133 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001134#else /* DILATION_X==1 && DILATION_Y==1 */
1135
1136 __global half *left_pixel_float = (__global half *)left_pixel;
1137
1138 return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
1139 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
1140 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
1141
1142#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001143}
1144
1145/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
1146 *
1147 * Convolution matrix layout:
1148 *
1149 * [ mat0, mat1, mat2 ]\n
1150 * [ mat3, mat4, mat5 ]\n
1151 * [ mat6, mat7, mat8 ]\n
1152 *
1153 * @param[in] src A pointer to source Image structure
1154 * @param[in] mat0 Coefficient from the convolution matrix
1155 * @param[in] mat1 Coefficient from the convolution matrix
1156 * @param[in] mat2 Coefficient from the convolution matrix
1157 * @param[in] mat3 Coefficient from the convolution matrix
1158 * @param[in] mat4 Coefficient from the convolution matrix
1159 * @param[in] mat5 Coefficient from the convolution matrix
1160 * @param[in] mat6 Coefficient from the convolution matrix
1161 * @param[in] mat0 Coefficient from the convolution matrix
1162 * @param[in] mat7 Coefficient from the convolution matrix
1163 * @param[in] mat8 Coefficient from the convolution matrix
1164 *
1165 * @return a half4 containing 4 convoluted values.
1166 */
1167inline half4 convolution3x3_f16(
1168 Image *src,
1169 const half mat0, const half mat1, const half mat2,
1170 const half mat3, const half mat4, const half mat5,
1171 const half mat6, const half mat7, const half mat8)
1172{
1173 half4 pixels;
1174
1175 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
Usama Arife73686a2019-04-08 17:30:48 +01001176 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
1177 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001178
1179 return pixels;
1180}
1181
Giorgio Arena76572242018-04-04 17:44:26 +01001182#if defined(DEPTH_MULTIPLIER)
1183
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001184/** This OpenCL kernel computes the depthwise convolution 3x3
1185 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001186 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001187 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1188 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1189 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001190 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001191 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1192 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001193 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001194 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001195 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001196 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1197 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001198 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001199 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001200 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1201 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1202 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1203 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1204 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1205 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1206 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001207 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001208 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1209 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1210 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1211 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1212 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1213 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1214 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001215 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001216 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1217 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1218 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1219 */
1220__kernel void depthwise_convolution_3x3_f16(
1221 TENSOR3D_DECLARATION(src),
1222 TENSOR3D_DECLARATION(dst),
1223 TENSOR3D_DECLARATION(weights)
1224#if defined(HAS_BIAS)
1225 ,
1226 VECTOR_DECLARATION(biases)
1227#endif //defined(HAS_BIAS)
1228)
1229{
1230 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1231 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001232 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001233#if defined(HAS_BIAS)
1234 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1235#endif //defined(HAS_BIAS)
1236
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001237 // Extract channel and linearized batch indices
1238 const int channel = get_global_id(2) % DST_CHANNELS;
1239 const int batch = get_global_id(2) / DST_CHANNELS;
1240 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1241 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
1242 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +01001243
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001244 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001245 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
1246 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
1247 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001248
1249 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
1250 weights_values1.s0, weights_values1.s1, weights_values1.s2,
1251 weights_values2.s0, weights_values2.s1, weights_values2.s2);
1252#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001253 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001254#endif //defined(HAS_BIAS)
1255
Usama Arif6a98a6e2019-05-10 17:07:27 +01001256 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001257}
Giorgio Arena76572242018-04-04 17:44:26 +01001258#endif // defined(DEPTH_MULTIPLIER)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001259#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001260
1261/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
1262 * when both stride_x and stride_y are equal to 1
1263 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001264 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001265 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1266 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1267 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001268 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001269 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1270 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001271 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001272 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001273 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001274 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1275 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001276 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001277 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1278 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1279 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1280 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1281 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1282 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1283 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1284 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1285 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1286 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1287 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1288 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1289 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1290 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1291 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1292 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1293 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1294 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1295 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1296 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1297 */
1298__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
1299 TENSOR3D_DECLARATION(src),
1300 TENSOR3D_DECLARATION(dst),
1301 TENSOR3D_DECLARATION(weights)
1302#if defined(HAS_BIAS)
1303 ,
1304 VECTOR_DECLARATION(biases)
1305#endif //defined(HAS_BIAS)
1306)
1307{
1308 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1309 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001310 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1311
1312 // Extract channel and linearized batch indices
1313 const int channel = get_global_id(2) % DST_CHANNELS;
1314 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001315
1316#ifdef HAS_BIAS
1317 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1318
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001319 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001320#endif /* defined(HAS_BIAS) */
1321
1322 half4 pixels0 = 0.0f;
1323 half4 pixels1 = 0.0f;
1324 half4 pixels2 = 0.0f;
1325 half4 pixels3 = 0.0f;
1326
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001327 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1328 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1329 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001330
Usama Arife73686a2019-04-08 17:30:48 +01001331#if(DILATION_X == 1 && DILATION_Y == 1)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001332 // Load the weights
1333 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1334 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1335 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1336
1337 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
1338 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1339 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1340 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1341 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1342 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1343 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
1344
1345 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
1346 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
1347 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
1348 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
1349 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
1350 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
1351 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
1352 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
1353 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
1354 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
1355 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
1356 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
1357
Usama Arife73686a2019-04-08 17:30:48 +01001358#else /* DILATION_X==1 && DILATION_Y==1 */
1359
1360 //3x3 Convolution of elements starting in 0th row
1361 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1362 //3x3 Convolution of elements starting in 1st row
1363 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
1364 //3x3 Convolution of elements starting in 2nd row
1365 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1366 //3x3 Convolution of elements starting in 3rd row
1367 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
1368
1369#endif /* DILATION_X==1 && DILATION_Y==1 */
1370
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001371#ifdef HAS_BIAS
1372 pixels0 += (half4)bias;
1373 pixels1 += (half4)bias;
1374 pixels2 += (half4)bias;
1375 pixels3 += (half4)bias;
1376#endif /* defined(HAS_BIAS) */
1377
Usama Arif6a98a6e2019-05-10 17:07:27 +01001378 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1379 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1380 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
1381 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001382}
1383
1384/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
1385 * when both stride_x and stride_y are equal to 2
1386 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001387 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001388 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1389 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1390 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001391 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001392 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1393 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001394 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001395 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001396 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001397 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001398 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1399 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001400 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1401 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1402 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1403 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1404 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1405 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1406 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1407 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1408 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1409 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1410 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1411 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1412 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1413 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1414 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1415 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1416 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1417 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1418 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1419 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1420 */
1421__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
1422 TENSOR3D_DECLARATION(src),
1423 TENSOR3D_DECLARATION(dst),
1424 TENSOR3D_DECLARATION(weights)
1425#if defined(HAS_BIAS)
1426 ,
1427 VECTOR_DECLARATION(biases)
1428#endif //defined(HAS_BIAS)
1429)
1430{
1431 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1432 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001433 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1434
1435 // Extract channel and linearized batch indices
1436 const int channel = get_global_id(2) % DST_CHANNELS;
1437 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001438
1439#ifdef HAS_BIAS
1440 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1441
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001442 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001443#endif /* defined(HAS_BIAS) */
1444
1445 half4 pixels0 = 0.0f;
1446 half4 pixels1 = 0.0f;
1447
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001448 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1449 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1450 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001451
Usama Arife73686a2019-04-08 17:30:48 +01001452#if(DILATION_X == 1 && DILATION_Y == 1)
1453
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001454 // Load the weights
1455 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1456 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1457 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1458
1459 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
1460 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1461 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1462 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1463 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1464 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1465 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1466 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1467 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1468 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1469 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1470
1471 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1472 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1473 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1474 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1475 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1476 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
1477
Usama Arife73686a2019-04-08 17:30:48 +01001478#else /* DILATION_X==1 && DILATION_Y==1 */
1479 //3x3 Convolution of elements starting in 0th row
1480 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1481 //3x3 Convolution of elements starting in 2nd row
1482 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1483#endif /* DILATION_X==1 && DILATION_Y==1 */
1484
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001485#ifdef HAS_BIAS
1486 pixels0 += (half4)bias;
1487 pixels1 += (half4)bias;
1488#endif /* defined(HAS_BIAS) */
1489
Usama Arif6a98a6e2019-05-10 17:07:27 +01001490 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1491 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001492}
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001493#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Giorgio Arenad051e972018-06-20 11:46:42 +01001494
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001495#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001496
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001497#if DATA_TYPE != float || DATA_TYPE != half
1498#error "Unsupported data type"
1499#endif // DATA_TYPE != float || DATA_TYPE != half
1500
1501#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001502
1503#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1504/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1505 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001506 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001507 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1508 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1509 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1510 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1511 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1512 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001513 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001514 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1515 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arenad051e972018-06-20 11:46:42 +01001516 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001517 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001518 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001519 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001520 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001521 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001522 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001523 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1524 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1525 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1526 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001527 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1528 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1529 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1530 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1531 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1532 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1533 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001534 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1535 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001536 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001537 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001538 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1539 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1540 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1541 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1542 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1543 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1544 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1545 * @param[in] max_offset Max offset for the input tensor
1546 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1547 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1548 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1549 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1550 */
1551__kernel void depthwise_convolution_3x3_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001552 TENSOR4D_DECLARATION(src),
1553 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001554 TENSOR3D_DECLARATION(weights),
1555#if defined(HAS_BIAS)
1556 VECTOR_DECLARATION(biases),
1557#endif /* defined(HAS_BIAS) */
1558 int max_offset)
1559{
1560 int x = get_global_id(0); // channels
1561 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001562#if defined(DST_DEPTH)
1563 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1564 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001565#else // defined(DST_DEPTH)
1566 int z = get_global_id(2); // spatial coordinate y
1567#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001568
1569 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1570
Georgios Pinitas37044642018-10-30 14:53:25 +00001571#if defined(DST_DEPTH)
1572 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1573#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001574 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001575#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001576
1577 int z_coord = 0;
1578 int4 offset = 0;
Usama Arife73686a2019-04-08 17:30:48 +01001579 int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001580
1581 // We compute 2x1x1 [C,W,H] elements
1582 VEC_FLOAT acc = 0;
1583
1584 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001585 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1586 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1587 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1588 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1589 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1590 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1591 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1592 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1593 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001594
1595 // Load input values
1596 // z == 0
1597 // Clamp z_coord as for z = 0, it can be negative
1598 // z_coord is casted to unsigned int in order to use just a min() operation
1599 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
1600 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1601 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1602 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001603 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001604
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001605 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1606 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1607 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001608
1609 // z == 1
1610 // z_coord can be only negative for z = 0 so we do not need to clamp it
1611 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Usama Arife73686a2019-04-08 17:30:48 +01001612 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001613 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001614 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1615 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1616 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001617
1618 // z == 2
Usama Arife73686a2019-04-08 17:30:48 +01001619 // Offset can be out-of-bound so we need to check if it is greater than max_offset
1620 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
1621 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001622 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001623 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1624 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1625 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001626
1627 acc = fma(values0, w0, acc);
1628 acc = fma(values1, w1, acc);
1629 acc = fma(values2, w2, acc);
1630
1631 acc = fma(values3, w3, acc);
1632 acc = fma(values4, w4, acc);
1633 acc = fma(values5, w5, acc);
1634
1635 acc = fma(values6, w6, acc);
1636 acc = fma(values7, w7, acc);
1637 acc = fma(values8, w8, acc);
1638
1639#if defined(HAS_BIAS)
1640 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001641 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001642 acc += bias_values;
1643#endif // defined(HAS_BIAS)
1644
Georgios Pinitas37044642018-10-30 14:53:25 +00001645#if defined(DST_DEPTH)
1646 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1647#else /* defined(DST_DEPTH) */
1648 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
1649#endif /* defined(DST_DEPTH) */
1650
Giorgio Arenad051e972018-06-20 11:46:42 +01001651 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001652 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr));
Giorgio Arenad051e972018-06-20 11:46:42 +01001653}
1654#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1655
1656#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1657/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1658 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001659 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001660 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1661 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1662 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1663 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1664 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1665 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001666 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001667 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1668 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arenad051e972018-06-20 11:46:42 +01001669 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001670 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001671 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001672 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001673 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001674 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001675 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001676 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1677 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1678 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1679 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001680 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1681 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1682 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1683 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1684 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1685 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1686 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001687 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1688 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001689 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001690 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001691 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1692 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1693 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1694 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1695 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1696 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1697 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1698 * @param[in] max_offset Max offset for the input tensor
1699 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1700 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1701 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1702 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1703 */
1704__kernel void depthwise_convolution_3x3_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +00001705 TENSOR4D_DECLARATION(src),
1706 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001707 TENSOR3D_DECLARATION(weights),
1708#if defined(HAS_BIAS)
1709 VECTOR_DECLARATION(biases),
1710#endif /* defined(HAS_BIAS) */
1711 int max_offset)
1712{
1713 int x = get_global_id(0); // channels
1714 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001715#if defined(DST_DEPTH)
1716 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1717 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001718#else // defined(DST_DEPTH)
1719 int z = get_global_id(2); // spatial coordinate y
1720#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001721
1722 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1723
Georgios Pinitas37044642018-10-30 14:53:25 +00001724#if defined(DST_DEPTH)
1725 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1726#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001727 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001728#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001729
1730 int z_coord = 0;
1731 int4 offset = 0;
Georgios Pinitased32f432018-07-10 17:03:11 +01001732 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001733
1734 // We compute 2x2x2 [C,W,H] elements
1735 VEC_FLOAT acc0 = 0;
1736 VEC_FLOAT acc1 = 0;
1737 VEC_FLOAT acc2 = 0;
1738 VEC_FLOAT acc3 = 0;
1739
1740 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001741 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1742 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1743 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1744 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1745 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1746 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1747 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1748 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1749 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001750
1751 // Load input values
1752 // z == 0
1753 // Clamp z_coord as for z = 0, it can be negative
1754 // z_coord is casted to unsigned int in order to use just a min() operation
1755 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitased32f432018-07-10 17:03:11 +01001756 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenad051e972018-06-20 11:46:42 +01001757 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1758 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001759 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001760
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001761 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1762 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1763 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1764 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001765
1766 // z == 1
1767 // z_coord can be only negative for z = 0 so we do not need to clamp it
1768 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001769 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenad051e972018-06-20 11:46:42 +01001770 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001771 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1772 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1773 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1774 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001775
1776 // z == 2
1777 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1778 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1779 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001780 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001781 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1782 VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1783 VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1784 VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001785
1786 // z == 3
1787 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1788 // However offset can be out-of-bound so we need to check if it is greater than max_offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001789 offset += (int4)src_stride_z;
1790 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001791 VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1792 VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1793 VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1794 VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001795
1796 acc0 = fma(values0, w0, acc0);
1797 acc0 = fma(values1, w1, acc0);
1798 acc0 = fma(values2, w2, acc0);
1799 acc1 = fma(values1, w0, acc1);
1800 acc1 = fma(values2, w1, acc1);
1801 acc1 = fma(values3, w2, acc1);
1802
1803 acc0 = fma(values4, w3, acc0);
1804 acc0 = fma(values5, w4, acc0);
1805 acc0 = fma(values6, w5, acc0);
1806 acc1 = fma(values5, w3, acc1);
1807 acc1 = fma(values6, w4, acc1);
1808 acc1 = fma(values7, w5, acc1);
1809
1810 acc0 = fma(values8, w6, acc0);
1811 acc0 = fma(values9, w7, acc0);
1812 acc0 = fma(values10, w8, acc0);
1813 acc1 = fma(values9, w6, acc1);
1814 acc1 = fma(values10, w7, acc1);
1815 acc1 = fma(values11, w8, acc1);
1816
1817 acc2 = fma(values4, w0, acc2);
1818 acc2 = fma(values5, w1, acc2);
1819 acc2 = fma(values6, w2, acc2);
1820 acc3 = fma(values5, w0, acc3);
1821 acc3 = fma(values6, w1, acc3);
1822 acc3 = fma(values7, w2, acc3);
1823
1824 acc2 = fma(values8, w3, acc2);
1825 acc2 = fma(values9, w4, acc2);
1826 acc2 = fma(values10, w5, acc2);
1827 acc3 = fma(values9, w3, acc3);
1828 acc3 = fma(values10, w4, acc3);
1829 acc3 = fma(values11, w5, acc3);
1830
1831 acc2 = fma(values12, w6, acc2);
1832 acc2 = fma(values13, w7, acc2);
1833 acc2 = fma(values14, w8, acc2);
1834 acc3 = fma(values13, w6, acc3);
1835 acc3 = fma(values14, w7, acc3);
1836 acc3 = fma(values15, w8, acc3);
1837
1838#if defined(HAS_BIAS)
1839 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1840
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001841 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001842
1843 acc0 += bias_values;
1844 acc1 += bias_values;
1845 acc2 += bias_values;
1846 acc3 += bias_values;
1847#endif // defined(HAS_BIAS)
1848
Georgios Pinitas37044642018-10-30 14:53:25 +00001849#if defined(DST_DEPTH)
1850 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1851#else /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001852 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001853#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001854
1855 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001856 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc0, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001857 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001858 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc1, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001859
1860#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1861 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1862#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1863 {
1864 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001865 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc2, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001866 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001867 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc3, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001868 }
1869}
1870
1871#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +01001872#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)