blob: 1b2f5cccaad163718c52dd66ed347cda9206594c [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
giuros016d109962019-01-07 17:47:19 +00002 * Copyright (c) 2017-2019 ARM Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Giorgio Arena93a690e2017-08-01 16:09:33 +010024#include "helpers.h"
25
Usama Arif6a98a6e2019-05-10 17:07:27 +010026#include "activation_float_helpers.h"
Manuel Bottinia788c2f2019-04-08 13:18:00 +010027
28/** Get the pointer position at a certain offset in x and y direction.
29 *
30 * @param[in] ptr Pointer to the starting position of the buffer
31 * @param[in] x Relative X position
32 * @param[in] y Relative Y position
33 * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
34 * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
35 *
36 * @return a uchar
37 */
38inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
39{
40 return ptr + x * stride_x + y * stride_y;
41}
42
43#if(DILATION_X == 1 && DILATION_Y == 1)
44
45#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
46 ({ \
47 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
48 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
49 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
50 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
51 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
52 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
53 })
54
55#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
56 ({ \
57 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
58 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
59 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
60 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
61 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
62 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
63 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
64 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
65 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
66 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
67 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
68 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
69 })
70
71#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
72 ({ \
73 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
74 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
75 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
76 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
77 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
78 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
79 })
80
81#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
82 ({ \
83 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
84 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
85 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
86 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
87 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
88 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
89 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
90 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
91 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
92 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
93 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
94 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
95 })
96
97#else /* DILATION_X==1 && DILATION_Y==1 */
98
99#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
100 ({ \
101 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
102 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
103 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
104 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
105 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
106 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
107 })
108
109#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
110 ({ \
111 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
112 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
113 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
114 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
115 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
116 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
117 })
118
119#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
120 ({ \
121 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
122 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
123 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
124 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
125 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
126 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
127 acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
128 acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
129 acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
130 acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
131 acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
132 acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
133 })
134
135#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
136 ({ \
137 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
138 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
139 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
140 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
141 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
142 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
143 acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
144 acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
145 acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
146 acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
147 acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
148 acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
149 })
150
151#endif /* DILATION_X==1 && DILATION_Y==1 */
152
153#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100154#if defined(CONV_STRIDE_X)
155
Giorgio Arena93a690e2017-08-01 16:09:33 +0100156#if CONV_STRIDE_X == 1
157#define convolution1x3 convolution1x3_stride_1
158#elif CONV_STRIDE_X == 2
159#define convolution1x3 convolution1x3_stride_2
160#elif CONV_STRIDE_X == 3
161#define convolution1x3 convolution1x3_stride_3
162#else /* CONV_STRIDE_X */
163#error "Stride not supported"
164#endif /* CONV_STRIDE_X */
165
166/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
167 *
168 * @param[in] left_pixel Pointer to the left pixel.
169 * @param[in] left_coeff Weight of the left pixel
170 * @param[in] middle_coeff Weight of the middle pixel
171 * @param[in] right_coeff Weight of the right pixel
172 *
173 * @return a float2 containing 2 convoluted values.
174 */
175inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
176 const float left_coeff,
177 const float middle_coeff,
178 const float right_coeff)
179{
Usama Arife73686a2019-04-08 17:30:48 +0100180#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100181 float4 temp = vload4(0, (__global float *)left_pixel);
182
183 float2 left = CONVERT(temp.s01, float2);
184 float2 middle = CONVERT(temp.s12, float2);
185 float2 right = CONVERT(temp.s23, float2);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100186 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100187#else /* DILATION_X==1 && DILATION_Y==1 */
188 return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
189 + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
190 + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
191#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100192}
193
194/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
195 *
196 * @param[in] left_pixel Pointer to the left pixel.
197 * @param[in] left_coeff Weight of the left pixel
198 * @param[in] middle_coeff Weight of the middle pixel
199 * @param[in] right_coeff Weight of the right pixel
200 *
201 * @return a float2 containing 2 convoluted values.
202 */
203inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
204 const float left_coeff,
205 const float middle_coeff,
206 const float right_coeff)
207{
Usama Arife73686a2019-04-08 17:30:48 +0100208#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100209 float4 temp0 = vload4(0, (__global float *)left_pixel);
210 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
211
212 float2 left = CONVERT(temp0.s02, float2);
213 float2 middle = CONVERT(temp0.s13, float2);
214 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
215
216 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100217#else /* DILATION_X==1 && DILATION_Y==1 */
218 __global float *left_pixel_float = (__global float *)left_pixel;
219
220 return vload4(0, left_pixel_float).s02 * (float2)left_coeff
221 + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
222 + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
223
224#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100225}
226
227/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
228 *
229 * @param[in] left_pixel Pointer to the left pixel.
230 * @param[in] left_coeff Weight of the left pixel
231 * @param[in] middle_coeff Weight of the middle pixel
232 * @param[in] right_coeff Weight of the right pixel
233 *
234 * @return a float2 containing 2 convoluted values.
235 */
236inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
237 const float left_coeff,
238 const float middle_coeff,
239 const float right_coeff)
240{
Usama Arife73686a2019-04-08 17:30:48 +0100241#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100242 float4 temp0 = vload4(0, (__global float *)left_pixel);
243 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
244
245 float2 left = CONVERT(temp0.s03, float2);
246 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
247 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
248
249 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100250#else /* DILATION_X==1 && DILATION_Y==1 */
251 __global float *left_pixel_float = (__global float *)left_pixel;
252
253 return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
254 + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
255 + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
256#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100257}
258
259/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
260 *
261 * Convolution matrix layout:
262 *
263 * [ mat0, mat1, mat2 ]\n
264 * [ mat3, mat4, mat5 ]\n
265 * [ mat6, mat7, mat8 ]\n
266 *
267 * @param[in] src A pointer to source Image structure
268 * @param[in] mat0 Coefficient from the convolution matrix
269 * @param[in] mat1 Coefficient from the convolution matrix
270 * @param[in] mat2 Coefficient from the convolution matrix
271 * @param[in] mat3 Coefficient from the convolution matrix
272 * @param[in] mat4 Coefficient from the convolution matrix
273 * @param[in] mat5 Coefficient from the convolution matrix
274 * @param[in] mat6 Coefficient from the convolution matrix
275 * @param[in] mat0 Coefficient from the convolution matrix
276 * @param[in] mat7 Coefficient from the convolution matrix
277 * @param[in] mat8 Coefficient from the convolution matrix
278 *
279 * @return a float2 containing 2 convoluted values.
280 */
281inline float2 convolution3x3(
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100282 __global const uchar *src,
283 unsigned int src_stride_y,
Giorgio Arena93a690e2017-08-01 16:09:33 +0100284 const float mat0, const float mat1, const float mat2,
285 const float mat3, const float mat4, const float mat5,
286 const float mat6, const float mat7, const float mat8)
287{
288 float2 pixels;
289
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100290 pixels = convolution1x3((src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
291 pixels += convolution1x3((src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
292 pixels += convolution1x3((src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100293
294 return pixels;
295}
296
Gian Marcoc799ed82018-02-01 16:57:48 +0000297/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000298 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100299 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
300 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
301 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000302 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
303 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000304 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000305 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000306 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000307 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000308 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
309 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000310 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000311 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
312 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
313 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
314 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
315 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
316 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
317 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000318 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000319 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
320 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
321 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
322 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
323 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
324 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
325 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
326 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
327 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
328 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
329 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
330 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100331__kernel void depthwise_convolution_3x3(
332 TENSOR3D_DECLARATION(src),
333 TENSOR3D_DECLARATION(dst),
334 TENSOR3D_DECLARATION(weights)
335#if defined(HAS_BIAS)
336 ,
337 VECTOR_DECLARATION(biases)
338#endif //defined(HAS_BIAS)
339)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100340{
341 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
342 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100343 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100344
345 float2 pixels = 0.0f;
Giorgio Arena93a690e2017-08-01 16:09:33 +0100346
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100347 // Extract channel and linearized batch indices
348 const int channel = get_global_id(2) % DST_CHANNELS;
349 const int batch = get_global_id(2) / DST_CHANNELS;
350 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100351
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100352 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100353
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100354 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Giorgio Arena93a690e2017-08-01 16:09:33 +0100355
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100356 // Load the weights
357 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
358 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
359 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
360
361 pixels = convolution3x3(src_addr, src_stride_y,
362 weights_values0.s0, weights_values0.s1, weights_values0.s2,
363 weights_values1.s0, weights_values1.s1, weights_values1.s2,
364 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100365#if defined(HAS_BIAS)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100366 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
367
368 float bias = *((__global float *)(vector_offset(&biases, channel)));
369
370 pixels += (float2)bias;
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100371#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100372
Usama Arif6a98a6e2019-05-10 17:07:27 +0100373 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100374}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100375#endif //defined(CONV_STRIDE_X)
376
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100377#if(DILATION_X > 1 || DILATION_Y > 1)
Usama Arife73686a2019-04-08 17:30:48 +0100378
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100379/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100380 *
381 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
382 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
383 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
384 * @param[in] y_offset Offset from the source tensor from which to start convolution
385 * @param[in] weights_addr Pointer from where to get weights
386 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
387 */
388inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
389 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
390{
391 // Load the weights
392 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
393 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
394 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
395
396 float2 pixels0 = 0.0f;
397
398 float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
399 float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
400 float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
401
402 float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
403 float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
404 float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
405
406 float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
407 float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
408 float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
409
410 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
411 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
412 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
413
414 return pixels0;
415}
416
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100417/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100418 *
419 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
420 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
421 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
422 * @param[in] y_offset Offset from the source tensor from which to start convolution
423 * @param[in] weights_addr Pointer from where to get weights
424 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
425 */
426inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
427 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
428{
429 // Load the weights
430 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
431 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
432 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
433
434 float2 pixels0 = 0.0f;
435
436 float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
437 float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
438 float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
439
440 float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
441 float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
442 float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
443
444 float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
445 float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
446 float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
447
448 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
449 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
450 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
451
452 return pixels0;
453}
454
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100455#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
Usama Arife73686a2019-04-08 17:30:48 +0100456
Gian Marcoc799ed82018-02-01 16:57:48 +0000457/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
458 * stride_x and stride_y are equal to 1
459 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100460 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100461 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
462 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
463 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100464 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000465 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
466 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000467 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000468 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000469 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000470 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000471 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
472 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
473 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
474 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
475 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
476 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
477 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
478 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
479 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
480 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
481 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
482 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
483 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
484 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
485 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
486 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
487 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
488 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
489 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
490 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
491 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
492 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
493 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000494__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000495 TENSOR3D_DECLARATION(src),
496 TENSOR3D_DECLARATION(dst),
497 TENSOR3D_DECLARATION(weights)
498#if defined(HAS_BIAS)
499 ,
500 VECTOR_DECLARATION(biases)
501#endif //defined(HAS_BIAS)
502)
503{
504 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
505 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100506 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000507
508 float2 pixels0 = 0.0f;
509 float2 pixels1 = 0.0f;
510 float2 pixels2 = 0.0f;
511 float2 pixels3 = 0.0f;
512
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100513 // Extract channel and linearized batch indices
514 const int channel = get_global_id(2) % DST_CHANNELS;
515 const int batch = get_global_id(2) / DST_CHANNELS;
516 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
517 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
518 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000519
Usama Arife73686a2019-04-08 17:30:48 +0100520#if(DILATION_X == 1 && DILATION_Y == 1)
Gian Marcoc799ed82018-02-01 16:57:48 +0000521 // Load the weights
522 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
523 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
524 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
525
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000526 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000527 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
528 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
529 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
530 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000531 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
532 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000533
534 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
535 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
536 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
537 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
538 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
539 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
540 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
541 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
542 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
543 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
544 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
545 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
546
Usama Arife73686a2019-04-08 17:30:48 +0100547#else /* DILATION_X==1 && DILATION_Y==1 */
548
549 //3x3 Convolution of elements starting in 0th row
550 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
551 //3x3 Convolution of elements starting in 1st row
552 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
553 //3x3 Convolution of elements starting in 2nd row
554 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
555 //3x3 Convolution of elements starting in 3rd row
556 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
557
558#endif /* DILATION_X==1 && DILATION_Y==1 */
559
Gian Marcoc799ed82018-02-01 16:57:48 +0000560#ifdef HAS_BIAS
561 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
562
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100563 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000564
565 pixels0 += (float2)bias;
566 pixels1 += (float2)bias;
567 pixels2 += (float2)bias;
568 pixels3 += (float2)bias;
569#endif /* defined(HAS_BIAS) */
570
Usama Arif6a98a6e2019-05-10 17:07:27 +0100571 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
572 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
573 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
574 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000575}
576
577/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
578 * stride_x and stride_y are equal to 2
579 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100580 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100581 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
582 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
583 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100584 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000585 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
586 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000587 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000588 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000589 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000590 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000591 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
592 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
593 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
594 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
595 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
596 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
597 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
598 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
599 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
600 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
601 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
602 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
603 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
604 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
605 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
606 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
607 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
608 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
609 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
610 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
611 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
612 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
613 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000614__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000615 TENSOR3D_DECLARATION(src),
616 TENSOR3D_DECLARATION(dst),
617 TENSOR3D_DECLARATION(weights)
618#if defined(HAS_BIAS)
619 ,
620 VECTOR_DECLARATION(biases)
621#endif //defined(HAS_BIAS)
622)
623{
624 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
625 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100626 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000627
628 float2 pixels0 = 0.0f;
629 float2 pixels1 = 0.0f;
630
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100631 // Extract channel and linearized batch indices
Georgios Pinitas37044642018-10-30 14:53:25 +0000632 const int channel = get_global_id(2) % DST_CHANNELS;
633 const int batch = get_global_id(2) / DST_CHANNELS;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100634 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
635 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
636 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000637
Usama Arife73686a2019-04-08 17:30:48 +0100638#if(DILATION_X == 1 && DILATION_Y == 1)
639
Gian Marcoc799ed82018-02-01 16:57:48 +0000640 // Load the weights
641 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
642 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
643 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
644
645 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
646 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
647 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
648 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
649 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
650 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
651 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
652 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
653 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
654 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
655 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
656
657 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
658 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
659 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
660 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
661 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
662 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
663
Usama Arife73686a2019-04-08 17:30:48 +0100664#else /* DILATION_X==1 && DILATION_Y==1 */
665
666 //3x3 Convolution of elements starting in 0th row
667 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
668 //3x3 Convolution of elements starting in 2nd row
669 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
670#endif /* DILATION_X==1 && DILATION_Y==1 */
671
Gian Marcoc799ed82018-02-01 16:57:48 +0000672#ifdef HAS_BIAS
673 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
674
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100675 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000676
677 pixels0 += (float2)bias;
678 pixels1 += (float2)bias;
679#endif /* defined(HAS_BIAS) */
680
Usama Arif6a98a6e2019-05-10 17:07:27 +0100681 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
682 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000683}
684
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100685#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena76572242018-04-04 17:44:26 +0100686
giuros016d109962019-01-07 17:47:19 +0000687#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
688/** Reshape the weights for quantized depthwise convolution
689 *
690 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
691 * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
692 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
693 * @attention Input's height and width should be 3
694 *
695 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
696 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
697 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
698 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
699 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
700 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
701 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
702 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
703 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
704 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
705 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
706 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
707 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
708 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
709 */
710__kernel void depthwise_convolution_reshape_weights(
711 TENSOR3D_DECLARATION(src),
712 IMAGE_DECLARATION(dst))
713{
714 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
715 const int x = get_global_id(0);
716
717 // Load 3x3xVEC_SIZE weights
718 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
719 w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
720 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
721 w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
722 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
723 w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
724 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
725 w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
726 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
727 w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
728 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
729 w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
730 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
731 w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
732 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
733 w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
734 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
735 w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
736
737 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
738
739#if defined(TRANSPOSE)
740#if VEC_SIZE != 4
741#error "VEC_SIZE not supported"
742#else // VEC_SIZE != 4
743 VSTORE(VEC_SIZE)
744 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
745 VSTORE(VEC_SIZE)
746 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
747 VSTORE(VEC_SIZE)
748 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
749 VSTORE(VEC_SIZE)
750 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
751 VSTORE(VEC_SIZE)
752 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
753 VSTORE(VEC_SIZE)
754 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
755 VSTORE(VEC_SIZE)
756 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
757 VSTORE(VEC_SIZE)
758 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
759 VSTORE(VEC_SIZE)
760 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
761#endif // VEC_SIZE != 4
762#else // !defined(TRANSPOSE)
763 VSTORE(VEC_SIZE)
764 (w0, 0, dst_addr + 0);
765 VSTORE(VEC_SIZE)
766 (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
767 VSTORE(VEC_SIZE)
768 (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
769 VSTORE(VEC_SIZE)
770 (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
771 VSTORE(VEC_SIZE)
772 (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
773 VSTORE(VEC_SIZE)
774 (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
775 VSTORE(VEC_SIZE)
776 (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
777 VSTORE(VEC_SIZE)
778 (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
779 VSTORE(VEC_SIZE)
780 (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
781#endif // defined(TRANSPOSE)
782}
783#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
784
Giorgio Arenad051e972018-06-20 11:46:42 +0100785#if defined(NCHW)
786#define in_stride_x src_stride_x
787#define in_stride_y src_stride_y
788#define in_stride_z src_stride_z
789#define out_stride_x dst_stride_x
790#define out_stride_y dst_stride_y
791#define out_stride_z dst_stride_z
792#else //defined(NCHW)
793#define in_stride_x src_stride_y
794#define in_stride_y src_stride_z
795#define in_stride_z src_stride_x
796#define out_stride_x dst_stride_y
797#define out_stride_y dst_stride_z
798#define out_stride_z dst_stride_x
799#endif //defined(NCHW)
800
Giorgio Arena9fe41442017-08-23 16:36:24 +0100801#if defined(SRC_WIDTH) && defined(DATA_TYPE)
802/** This kernel reshapes each of the tensor's low three dimensions to single rows.
803 *
804 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
805 *
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100806 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
807 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
808 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
809 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
810 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
811 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
812 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
813 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
814 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
815 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
816 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
817 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
818 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
819 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
820 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
821 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
822 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
823 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Giorgio Arena9fe41442017-08-23 16:36:24 +0100824 */
giuros016d109962019-01-07 17:47:19 +0000825__kernel void depthwise_convolution_reshape_weights_generic(
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100826 TENSOR3D_DECLARATION(src),
827 IMAGE_DECLARATION(dst)
828#ifdef HAS_BIAS
829 ,
830 VECTOR_DECLARATION(biases)
831#endif /* HAS_BIAS */
832)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100833{
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100834#ifdef HAS_BIAS
835 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
836#endif /* HAS_BIAS */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100837
Giorgio Arenad051e972018-06-20 11:46:42 +0100838 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
839 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100840
Giorgio Arenad051e972018-06-20 11:46:42 +0100841 for(int i = 0; i < SRC_WIDTH; ++i, input_ptr += in_stride_x)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100842 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100843 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *((__global DATA_TYPE *)input_ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100844 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100845
846#if defined(HAS_BIAS)
847 if(get_global_id(1) == 0)
848 {
Michele Di Giorgiod24af8a2018-05-08 17:23:52 +0100849 *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100850 }
851#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100852}
853#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
854
Usama Arife73686a2019-04-08 17:30:48 +0100855#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER) && defined(DILATION_X) && defined(DILATION_Y)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100856/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
857 *
858 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Giorgio Arena76572242018-04-04 17:44:26 +0100859 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
Usama Arife73686a2019-04-08 17:30:48 +0100860 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
Giorgio Arena9fe41442017-08-23 16:36:24 +0100861 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100862 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100863 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
864 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
865 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
866 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
867 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
868 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
869 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
870 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
871 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
872 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
873 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
874 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
875 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
876 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
877 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
878 */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100879__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
880{
881 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
882
883 const int src_pixel_linear = get_global_id(1) * STRIDE_X;
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100884 const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
Usama Arife73686a2019-04-08 17:30:48 +0100885 const int max_initial_x = STRIDE_X * (((full_length - (KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1))) / STRIDE_X) + 1);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100886
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100887 const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
888 const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
Giorgio Arena76572242018-04-04 17:44:26 +0100889 const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100890
Giorgio Arenad051e972018-06-20 11:46:42 +0100891 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100892 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
893
Usama Arife73686a2019-04-08 17:30:48 +0100894 for(int y = src_y; y < src_y + KERNEL_HEIGHT + (KERNEL_HEIGHT - 1) * (DILATION_Y - 1); y += DILATION_Y)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100895 {
Usama Arife73686a2019-04-08 17:30:48 +0100896 for(int x = src_x; x < src_x + KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1); x += DILATION_X, ++output_ptr)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100897 {
898 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
899 {
Georgios Pinitasde5a1cc2018-02-02 12:52:07 +0000900 *output_ptr = PAD_VALUE;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100901 }
902 else
903 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100904 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * in_stride_x + y * in_stride_y));
Giorgio Arena9fe41442017-08-23 16:36:24 +0100905 }
906 }
907 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100908#if defined(HAS_BIAS)
909 *output_ptr = (DATA_TYPE)(1);
910#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100911}
912
Giorgio Arena76572242018-04-04 17:44:26 +0100913#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100914
915#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
916
917/** This kernel performs a reshaping of the output of the depthwise generic convolution.
918 *
919 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
920 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
921 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100922 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100923 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
924 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
925 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
926 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
927 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
928 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
929 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
930 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
931 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
932 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
933 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
934 */
935__kernel void depthwise_vector_to_tensor(
936 VECTOR_DECLARATION(src),
937 TENSOR3D_DECLARATION(dst))
938{
939 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
940
941 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
942 const int id0 = get_global_id(0);
943 const int z = id0 / patch_size;
944 const int index2D = id0 - z * patch_size;
945
Giorgio Arenad051e972018-06-20 11:46:42 +0100946 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * out_stride_x + index2D / CONV_WIDTH * out_stride_y + z * out_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100947 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
948}
949
950#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000951
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100952#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000953#if defined(CONV_STRIDE_X)
954#if CONV_STRIDE_X == 1
955#define convolution1x3_f16 convolution1x3_stride_1_f16
956#elif CONV_STRIDE_X == 2
957#define convolution1x3_f16 convolution1x3_stride_2_f16
958#elif CONV_STRIDE_X == 3
959#define convolution1x3_f16 convolution1x3_stride_3_f16
960#else /* CONV_STRIDE_X */
961#error "Stride not supported"
962#endif /* CONV_STRIDE_X */
963
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100964#if(DILATION_X > 1 || DILATION_Y > 1)
965
966/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16
967 *
968 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
969 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
970 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
971 * @param[in] y_offset Offset from the source tensor from which to start convolution
972 * @param[in] weights_addr Pointer from where to get weights
973 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
974 */
975inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
976 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
977{
978 // Load the weights
979 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
980 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
981 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
982
983 half4 pixels0 = 0.0f;
984
985 half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
986 half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
987 half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
988
989 half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
990 half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
991 half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
992
993 half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
994 half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
995 half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
996
997 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
998 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
999 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
1000
1001 return pixels0;
1002}
1003
1004/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
1005 *
1006 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
1007 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1008 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1009 * @param[in] y_offset Offset from the source tensor from which to start convolution
1010 * @param[in] weights_addr Pointer from where to get weights
1011 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
1012 */
1013inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
1014 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
1015{
1016 // Load the weights
1017 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1018 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1019 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1020
1021 half4 pixels0 = 0.0f;
1022
1023 half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
1024 half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
1025 half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
1026
1027 half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
1028 half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
1029 half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
1030
1031 half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
1032 half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
1033 half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
1034
1035 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
1036 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
1037 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
1038
1039 return pixels0;
1040}
1041
1042#endif // (DILATION_X > 1 && DILATION_Y > 1)
1043
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001044/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
1045 *
1046 * @param[in] left_pixel Pointer to the left pixel.
1047 * @param[in] left_coeff Weight of the left pixel
1048 * @param[in] middle_coeff Weight of the middle pixel
1049 * @param[in] right_coeff Weight of the right pixel
1050 *
1051 * @return a half4 containing 4 convoluted values.
1052 */
1053inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
1054 const half left_coeff,
1055 const half middle_coeff,
1056 const half right_coeff)
1057{
Usama Arife73686a2019-04-08 17:30:48 +01001058#if(DILATION_X == 1 && DILATION_Y == 1)
1059
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001060 half8 temp = vload8(0, (__global half *)left_pixel);
1061
1062 half4 left = CONVERT(temp.s0123, half4);
1063 half4 middle = CONVERT(temp.s1234, half4);
1064 half4 right = CONVERT(temp.s2345, half4);
1065
1066 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001067#else /* DILATION_X==1 && DILATION_Y==1 */
1068 return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
1069 + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
1070 + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
1071
1072#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001073}
1074
1075/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
1076 *
1077 * @param[in] left_pixel Pointer to the left pixel.
1078 * @param[in] left_coeff Weight of the left pixel
1079 * @param[in] middle_coeff Weight of the middle pixel
1080 * @param[in] right_coeff Weight of the right pixel
1081 *
1082 * @return a half4 containing 4 convoluted values.
1083 */
1084inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
1085 const half left_coeff,
1086 const half middle_coeff,
1087 const half right_coeff)
1088{
Usama Arife73686a2019-04-08 17:30:48 +01001089#if(DILATION_X == 1 && DILATION_Y == 1)
1090
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001091 half8 temp0 = vload8(0, (__global half *)left_pixel);
1092 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
1093
1094 half4 left = CONVERT(temp0.s0246, half4);
1095 half4 middle = CONVERT(temp0.s1357, half4);
1096 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
1097
1098 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001099#else /* DILATION_X==1 && DILATION_Y==1 */
1100
1101 __global half *left_pixel_float = (__global half *)left_pixel;
1102
1103 return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
1104 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
1105 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
1106
1107#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001108}
1109
1110/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
1111 *
1112 * @param[in] left_pixel Pointer to the left pixel.
1113 * @param[in] left_coeff Weight of the left pixel
1114 * @param[in] middle_coeff Weight of the middle pixel
1115 * @param[in] right_coeff Weight of the right pixel
1116 *
1117 * @return a half4 containing 4 convoluted values.
1118 */
1119inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
1120 const half left_coeff,
1121 const half middle_coeff,
1122 const half right_coeff)
1123{
Usama Arife73686a2019-04-08 17:30:48 +01001124#if(DILATION_X == 1 && DILATION_Y == 1)
1125
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001126 half16 temp0 = vload16(0, (__global half *)left_pixel);
1127
1128 half4 left = CONVERT(temp0.s0369, half4);
1129 half4 middle = CONVERT(temp0.s147A, half4);
1130 half4 right = CONVERT(temp0.s258B, half4);
1131
1132 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001133#else /* DILATION_X==1 && DILATION_Y==1 */
1134
1135 __global half *left_pixel_float = (__global half *)left_pixel;
1136
1137 return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
1138 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
1139 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
1140
1141#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001142}
1143
1144/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
1145 *
1146 * Convolution matrix layout:
1147 *
1148 * [ mat0, mat1, mat2 ]\n
1149 * [ mat3, mat4, mat5 ]\n
1150 * [ mat6, mat7, mat8 ]\n
1151 *
1152 * @param[in] src A pointer to source Image structure
1153 * @param[in] mat0 Coefficient from the convolution matrix
1154 * @param[in] mat1 Coefficient from the convolution matrix
1155 * @param[in] mat2 Coefficient from the convolution matrix
1156 * @param[in] mat3 Coefficient from the convolution matrix
1157 * @param[in] mat4 Coefficient from the convolution matrix
1158 * @param[in] mat5 Coefficient from the convolution matrix
1159 * @param[in] mat6 Coefficient from the convolution matrix
1160 * @param[in] mat0 Coefficient from the convolution matrix
1161 * @param[in] mat7 Coefficient from the convolution matrix
1162 * @param[in] mat8 Coefficient from the convolution matrix
1163 *
1164 * @return a half4 containing 4 convoluted values.
1165 */
1166inline half4 convolution3x3_f16(
1167 Image *src,
1168 const half mat0, const half mat1, const half mat2,
1169 const half mat3, const half mat4, const half mat5,
1170 const half mat6, const half mat7, const half mat8)
1171{
1172 half4 pixels;
1173
1174 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
Usama Arife73686a2019-04-08 17:30:48 +01001175 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
1176 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001177
1178 return pixels;
1179}
1180
Giorgio Arena76572242018-04-04 17:44:26 +01001181#if defined(DEPTH_MULTIPLIER)
1182
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001183/** This OpenCL kernel computes the depthwise convolution 3x3
1184 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001185 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001186 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1187 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1188 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001189 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001190 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1191 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001192 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001193 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001194 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001195 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1196 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001197 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001198 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001199 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1200 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1201 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1202 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1203 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1204 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1205 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001206 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001207 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1208 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1209 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1210 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1211 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1212 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1213 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001214 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001215 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1216 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1217 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1218 */
1219__kernel void depthwise_convolution_3x3_f16(
1220 TENSOR3D_DECLARATION(src),
1221 TENSOR3D_DECLARATION(dst),
1222 TENSOR3D_DECLARATION(weights)
1223#if defined(HAS_BIAS)
1224 ,
1225 VECTOR_DECLARATION(biases)
1226#endif //defined(HAS_BIAS)
1227)
1228{
1229 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1230 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001231 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001232#if defined(HAS_BIAS)
1233 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1234#endif //defined(HAS_BIAS)
1235
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001236 // Extract channel and linearized batch indices
1237 const int channel = get_global_id(2) % DST_CHANNELS;
1238 const int batch = get_global_id(2) / DST_CHANNELS;
1239 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1240 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
1241 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +01001242
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001243 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001244 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
1245 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
1246 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001247
1248 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
1249 weights_values1.s0, weights_values1.s1, weights_values1.s2,
1250 weights_values2.s0, weights_values2.s1, weights_values2.s2);
1251#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001252 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001253#endif //defined(HAS_BIAS)
1254
Usama Arif6a98a6e2019-05-10 17:07:27 +01001255 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001256}
Giorgio Arena76572242018-04-04 17:44:26 +01001257#endif // defined(DEPTH_MULTIPLIER)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001258#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001259
1260/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
1261 * when both stride_x and stride_y are equal to 1
1262 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001263 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001264 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1265 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1266 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001267 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001268 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1269 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001270 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001271 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001272 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001273 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1274 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001275 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001276 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1277 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1278 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1279 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1280 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1281 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1282 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1283 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1284 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1285 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1286 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1287 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1288 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1289 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1290 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1291 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1292 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1293 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1294 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1295 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1296 */
1297__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
1298 TENSOR3D_DECLARATION(src),
1299 TENSOR3D_DECLARATION(dst),
1300 TENSOR3D_DECLARATION(weights)
1301#if defined(HAS_BIAS)
1302 ,
1303 VECTOR_DECLARATION(biases)
1304#endif //defined(HAS_BIAS)
1305)
1306{
1307 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1308 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001309 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1310
1311 // Extract channel and linearized batch indices
1312 const int channel = get_global_id(2) % DST_CHANNELS;
1313 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001314
1315#ifdef HAS_BIAS
1316 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1317
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001318 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001319#endif /* defined(HAS_BIAS) */
1320
1321 half4 pixels0 = 0.0f;
1322 half4 pixels1 = 0.0f;
1323 half4 pixels2 = 0.0f;
1324 half4 pixels3 = 0.0f;
1325
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001326 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1327 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1328 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001329
Usama Arife73686a2019-04-08 17:30:48 +01001330#if(DILATION_X == 1 && DILATION_Y == 1)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001331 // Load the weights
1332 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1333 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1334 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1335
1336 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
1337 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1338 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1339 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1340 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1341 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1342 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
1343
1344 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
1345 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
1346 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
1347 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
1348 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
1349 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
1350 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
1351 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
1352 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
1353 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
1354 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
1355 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
1356
Usama Arife73686a2019-04-08 17:30:48 +01001357#else /* DILATION_X==1 && DILATION_Y==1 */
1358
1359 //3x3 Convolution of elements starting in 0th row
1360 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1361 //3x3 Convolution of elements starting in 1st row
1362 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
1363 //3x3 Convolution of elements starting in 2nd row
1364 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1365 //3x3 Convolution of elements starting in 3rd row
1366 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
1367
1368#endif /* DILATION_X==1 && DILATION_Y==1 */
1369
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001370#ifdef HAS_BIAS
1371 pixels0 += (half4)bias;
1372 pixels1 += (half4)bias;
1373 pixels2 += (half4)bias;
1374 pixels3 += (half4)bias;
1375#endif /* defined(HAS_BIAS) */
1376
Usama Arif6a98a6e2019-05-10 17:07:27 +01001377 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1378 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1379 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
1380 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001381}
1382
1383/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
1384 * when both stride_x and stride_y are equal to 2
1385 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001386 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001387 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1388 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1389 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001390 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001391 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1392 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001393 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001394 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001395 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001396 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001397 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1398 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001399 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1400 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1401 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1402 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1403 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1404 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1405 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1406 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1407 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1408 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1409 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1410 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1411 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1412 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1413 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1414 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1415 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1416 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1417 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1418 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1419 */
1420__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
1421 TENSOR3D_DECLARATION(src),
1422 TENSOR3D_DECLARATION(dst),
1423 TENSOR3D_DECLARATION(weights)
1424#if defined(HAS_BIAS)
1425 ,
1426 VECTOR_DECLARATION(biases)
1427#endif //defined(HAS_BIAS)
1428)
1429{
1430 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1431 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001432 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1433
1434 // Extract channel and linearized batch indices
1435 const int channel = get_global_id(2) % DST_CHANNELS;
1436 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001437
1438#ifdef HAS_BIAS
1439 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1440
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001441 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001442#endif /* defined(HAS_BIAS) */
1443
1444 half4 pixels0 = 0.0f;
1445 half4 pixels1 = 0.0f;
1446
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001447 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1448 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1449 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001450
Usama Arife73686a2019-04-08 17:30:48 +01001451#if(DILATION_X == 1 && DILATION_Y == 1)
1452
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001453 // Load the weights
1454 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1455 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1456 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1457
1458 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
1459 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1460 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1461 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1462 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1463 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1464 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1465 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1466 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1467 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1468 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1469
1470 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1471 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1472 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1473 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1474 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1475 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
1476
Usama Arife73686a2019-04-08 17:30:48 +01001477#else /* DILATION_X==1 && DILATION_Y==1 */
1478 //3x3 Convolution of elements starting in 0th row
1479 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1480 //3x3 Convolution of elements starting in 2nd row
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001481 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +01001482#endif /* DILATION_X==1 && DILATION_Y==1 */
1483
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001484#ifdef HAS_BIAS
1485 pixels0 += (half4)bias;
1486 pixels1 += (half4)bias;
1487#endif /* defined(HAS_BIAS) */
1488
Usama Arif6a98a6e2019-05-10 17:07:27 +01001489 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1490 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001491}
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001492#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Giorgio Arenad051e972018-06-20 11:46:42 +01001493
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001494#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP)
1495/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
1496 *
1497 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
1498 * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
1499 * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
1500 * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
1501 * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
1502 * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
1503 * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
1504 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1505 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1506 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1507 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
1508 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
1509 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1510 *
1511 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
1512 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1513 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1514 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1515 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1516 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1517 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1518 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1519 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1520 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1521 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1522 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1523 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1524 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1525 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1526 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1527 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1528 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1529 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1530 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1531 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
1532 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1533 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1534 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1535 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1536 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1537 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1538 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1539 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1540 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1541 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1542 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1543 */
1544__kernel void dwc_MxN_native_fp_nhwc(
1545 TENSOR4D_DECLARATION(src),
1546 TENSOR4D_DECLARATION(dst),
1547 TENSOR3D_DECLARATION(weights),
1548#if defined(HAS_BIAS)
1549 VECTOR_DECLARATION(biases)
1550#endif // defined(HAS_BIAS)
1551)
1552{
1553 int x = get_global_id(0); // channels
1554 int y = get_global_id(1); // spatial coordinate x
1555#if defined(DST_DEPTH)
1556 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1557 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1558#else // defined(DST_DEPTH)
1559 int z = get_global_id(2); // spatial coordinate y
1560#endif // defined(DST_DEPTH)
1561
1562 __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes +
1563 x * sizeof(DATA_TYPE) * (int)N0;
1564
1565 __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes +
1566 x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 +
1567 y * dst_stride_y +
1568 z * dst_stride_z;
1569
1570 __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes +
1571 x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
1572
1573#if defined(HAS_BIAS)
1574 __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes +
1575 x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
1576#endif // defined(HAS_BIAS)
1577
1578#if defined(DST_DEPTH)
1579 s_addr += b * src_stride_w;
1580 d_addr += b * dst_stride_w;
1581#endif // defined(DST_DEPTH)
1582
1583 for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
1584 {
1585 // Each work-item computes N0x1x1 elements
1586 VEC_DATA_TYPE(DATA_TYPE, N0)
1587 res = 0;
1588
1589 int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
1590 int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1591
1592 for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
1593 {
1594 if(y_coord >= 0 && y_coord < SRC_DIM2)
1595 {
1596 int x_coord_tmp = x_coord;
1597
1598 for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
1599 {
1600 if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
1601 {
1602 int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
1603 int w_offset = xk * weights_stride_y + yk * weights_stride_z;
1604
1605 // Load input and weights values
1606 VEC_DATA_TYPE(DATA_TYPE, N0)
1607 i = VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset));
1608 VEC_DATA_TYPE(DATA_TYPE, N0)
1609 w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset));
1610
1611#if GPU_ARCH == GPU_ARCH_MIDGARD
1612 res += i * w;
1613#else // GPU_ARCH == GPU_ARCH_MIDGARD
1614 res = fma(i, w, res);
1615#endif // GPU_ARCH == GPU_ARCH_MIDGARD
1616 }
1617 x_coord_tmp += DILATION_X;
1618 }
1619 }
1620 y_coord += DILATION_Y;
1621 }
1622
1623#if defined(HAS_BIAS)
1624 res += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));
1625#endif // defined(HAS_BIAS)
1626
1627 res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, res, A_VAL, B_VAL);
1628
1629 VSTORE(N0)
1630 (res, 0, (__global DATA_TYPE *)(d_addr));
1631
1632 w_addr += sizeof(DATA_TYPE);
1633 d_addr += sizeof(DATA_TYPE);
1634#if defined(HAS_BIAS)
1635 b_addr += sizeof(DATA_TYPE);
1636#endif // defined(HAS_BIAS)
1637 }
1638}
1639#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP)
1640
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001641#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001642
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001643#if DATA_TYPE != float || DATA_TYPE != half
1644#error "Unsupported data type"
1645#endif // DATA_TYPE != float || DATA_TYPE != half
1646
1647#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001648
1649#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001650
Giorgio Arenad051e972018-06-20 11:46:42 +01001651/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1652 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001653 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001654 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1655 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1656 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1657 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1658 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1659 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001660 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001661 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1662 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arenad051e972018-06-20 11:46:42 +01001663 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001664 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001665 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001666 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001667 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001668 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001669 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001670 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1671 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1672 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1673 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001674 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1675 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1676 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1677 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1678 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1679 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1680 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001681 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1682 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001683 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001684 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001685 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1686 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1687 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1688 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1689 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1690 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1691 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1692 * @param[in] max_offset Max offset for the input tensor
1693 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1694 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1695 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1696 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1697 */
1698__kernel void depthwise_convolution_3x3_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001699 TENSOR4D_DECLARATION(src),
1700 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001701 TENSOR3D_DECLARATION(weights),
1702#if defined(HAS_BIAS)
1703 VECTOR_DECLARATION(biases),
1704#endif /* defined(HAS_BIAS) */
1705 int max_offset)
1706{
1707 int x = get_global_id(0); // channels
1708 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001709#if defined(DST_DEPTH)
1710 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1711 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001712#else // defined(DST_DEPTH)
1713 int z = get_global_id(2); // spatial coordinate y
1714#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001715
1716 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1717
Georgios Pinitas37044642018-10-30 14:53:25 +00001718#if defined(DST_DEPTH)
1719 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1720#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001721 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001722#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001723
1724 int z_coord = 0;
1725 int4 offset = 0;
Usama Arife73686a2019-04-08 17:30:48 +01001726 int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001727
1728 // We compute 2x1x1 [C,W,H] elements
1729 VEC_FLOAT acc = 0;
1730
1731 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001732 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1733 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1734 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1735 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1736 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1737 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1738 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1739 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1740 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001741
1742 // Load input values
1743 // z == 0
1744 // Clamp z_coord as for z = 0, it can be negative
1745 // z_coord is casted to unsigned int in order to use just a min() operation
1746 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
1747 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1748 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1749 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001750 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001751
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001752 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1753 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1754 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001755
1756 // z == 1
1757 // z_coord can be only negative for z = 0 so we do not need to clamp it
1758 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Usama Arife73686a2019-04-08 17:30:48 +01001759 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001760 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001761 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1762 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1763 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001764
1765 // z == 2
Usama Arife73686a2019-04-08 17:30:48 +01001766 // Offset can be out-of-bound so we need to check if it is greater than max_offset
1767 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
1768 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001769 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001770 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1771 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1772 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001773
1774 acc = fma(values0, w0, acc);
1775 acc = fma(values1, w1, acc);
1776 acc = fma(values2, w2, acc);
1777
1778 acc = fma(values3, w3, acc);
1779 acc = fma(values4, w4, acc);
1780 acc = fma(values5, w5, acc);
1781
1782 acc = fma(values6, w6, acc);
1783 acc = fma(values7, w7, acc);
1784 acc = fma(values8, w8, acc);
1785
1786#if defined(HAS_BIAS)
1787 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001788 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001789 acc += bias_values;
1790#endif // defined(HAS_BIAS)
1791
Georgios Pinitas37044642018-10-30 14:53:25 +00001792#if defined(DST_DEPTH)
1793 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1794#else /* defined(DST_DEPTH) */
1795 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
1796#endif /* defined(DST_DEPTH) */
1797
Giorgio Arenad051e972018-06-20 11:46:42 +01001798 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001799 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr));
Giorgio Arenad051e972018-06-20 11:46:42 +01001800}
1801#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1802
1803#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1804/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1805 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001806 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001807 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1808 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1809 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1810 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1811 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1812 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001813 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001814 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1815 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arenad051e972018-06-20 11:46:42 +01001816 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001817 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001818 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001819 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001820 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001821 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001822 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001823 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1824 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1825 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1826 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001827 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1828 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1829 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1830 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1831 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1832 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1833 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001834 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1835 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001836 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001837 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001838 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1839 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1840 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1841 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1842 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1843 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1844 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1845 * @param[in] max_offset Max offset for the input tensor
1846 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1847 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1848 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1849 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1850 */
1851__kernel void depthwise_convolution_3x3_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +00001852 TENSOR4D_DECLARATION(src),
1853 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001854 TENSOR3D_DECLARATION(weights),
1855#if defined(HAS_BIAS)
1856 VECTOR_DECLARATION(biases),
1857#endif /* defined(HAS_BIAS) */
1858 int max_offset)
1859{
1860 int x = get_global_id(0); // channels
1861 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001862#if defined(DST_DEPTH)
1863 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1864 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001865#else // defined(DST_DEPTH)
1866 int z = get_global_id(2); // spatial coordinate y
1867#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001868
1869 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1870
Georgios Pinitas37044642018-10-30 14:53:25 +00001871#if defined(DST_DEPTH)
1872 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1873#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001874 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001875#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001876
1877 int z_coord = 0;
1878 int4 offset = 0;
Georgios Pinitased32f432018-07-10 17:03:11 +01001879 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001880
1881 // We compute 2x2x2 [C,W,H] elements
1882 VEC_FLOAT acc0 = 0;
1883 VEC_FLOAT acc1 = 0;
1884 VEC_FLOAT acc2 = 0;
1885 VEC_FLOAT acc3 = 0;
1886
1887 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001888 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1889 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1890 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1891 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1892 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1893 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1894 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1895 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1896 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001897
1898 // Load input values
1899 // z == 0
1900 // Clamp z_coord as for z = 0, it can be negative
1901 // z_coord is casted to unsigned int in order to use just a min() operation
1902 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitased32f432018-07-10 17:03:11 +01001903 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenad051e972018-06-20 11:46:42 +01001904 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1905 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001906 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001907
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001908 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1909 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1910 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1911 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001912
1913 // z == 1
1914 // z_coord can be only negative for z = 0 so we do not need to clamp it
1915 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001916 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenad051e972018-06-20 11:46:42 +01001917 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001918 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1919 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1920 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1921 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001922
1923 // z == 2
1924 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1925 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1926 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001927 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001928 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1929 VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1930 VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1931 VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001932
1933 // z == 3
1934 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1935 // However offset can be out-of-bound so we need to check if it is greater than max_offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001936 offset += (int4)src_stride_z;
1937 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001938 VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1939 VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1940 VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1941 VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001942
1943 acc0 = fma(values0, w0, acc0);
1944 acc0 = fma(values1, w1, acc0);
1945 acc0 = fma(values2, w2, acc0);
1946 acc1 = fma(values1, w0, acc1);
1947 acc1 = fma(values2, w1, acc1);
1948 acc1 = fma(values3, w2, acc1);
1949
1950 acc0 = fma(values4, w3, acc0);
1951 acc0 = fma(values5, w4, acc0);
1952 acc0 = fma(values6, w5, acc0);
1953 acc1 = fma(values5, w3, acc1);
1954 acc1 = fma(values6, w4, acc1);
1955 acc1 = fma(values7, w5, acc1);
1956
1957 acc0 = fma(values8, w6, acc0);
1958 acc0 = fma(values9, w7, acc0);
1959 acc0 = fma(values10, w8, acc0);
1960 acc1 = fma(values9, w6, acc1);
1961 acc1 = fma(values10, w7, acc1);
1962 acc1 = fma(values11, w8, acc1);
1963
1964 acc2 = fma(values4, w0, acc2);
1965 acc2 = fma(values5, w1, acc2);
1966 acc2 = fma(values6, w2, acc2);
1967 acc3 = fma(values5, w0, acc3);
1968 acc3 = fma(values6, w1, acc3);
1969 acc3 = fma(values7, w2, acc3);
1970
1971 acc2 = fma(values8, w3, acc2);
1972 acc2 = fma(values9, w4, acc2);
1973 acc2 = fma(values10, w5, acc2);
1974 acc3 = fma(values9, w3, acc3);
1975 acc3 = fma(values10, w4, acc3);
1976 acc3 = fma(values11, w5, acc3);
1977
1978 acc2 = fma(values12, w6, acc2);
1979 acc2 = fma(values13, w7, acc2);
1980 acc2 = fma(values14, w8, acc2);
1981 acc3 = fma(values13, w6, acc3);
1982 acc3 = fma(values14, w7, acc3);
1983 acc3 = fma(values15, w8, acc3);
1984
1985#if defined(HAS_BIAS)
1986 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1987
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001988 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001989
1990 acc0 += bias_values;
1991 acc1 += bias_values;
1992 acc2 += bias_values;
1993 acc3 += bias_values;
1994#endif // defined(HAS_BIAS)
1995
Georgios Pinitas37044642018-10-30 14:53:25 +00001996#if defined(DST_DEPTH)
1997 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1998#else /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001999 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00002000#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01002001
2002 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01002003 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc0, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01002004 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01002005 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc1, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01002006
2007#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
2008 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
2009#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
2010 {
2011 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01002012 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc2, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01002013 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01002014 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc3, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01002015 }
2016}
2017
2018#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +01002019#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)