blob: 3a227282ff75f11676f43d021a9b614725c03f70 [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
giuros016d109962019-01-07 17:47:19 +00002 * Copyright (c) 2017-2019 ARM Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Giorgio Arena93a690e2017-08-01 16:09:33 +010024#include "helpers.h"
25
Usama Arif6a98a6e2019-05-10 17:07:27 +010026#include "activation_float_helpers.h"
Manuel Bottinia788c2f2019-04-08 13:18:00 +010027
28/** Get the pointer position at a certain offset in x and y direction.
29 *
30 * @param[in] ptr Pointer to the starting position of the buffer
31 * @param[in] x Relative X position
32 * @param[in] y Relative Y position
33 * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
34 * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
35 *
36 * @return a uchar
37 */
38inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
39{
40 return ptr + x * stride_x + y * stride_y;
41}
42
43#if(DILATION_X == 1 && DILATION_Y == 1)
44
45#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
46 ({ \
47 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
48 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
49 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
50 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
51 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
52 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
53 })
54
55#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
56 ({ \
57 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
58 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
59 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
60 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
61 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
62 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
63 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
64 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
65 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
66 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
67 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
68 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
69 })
70
71#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
72 ({ \
73 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
74 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
75 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
76 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
77 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
78 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
79 })
80
81#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
82 ({ \
83 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
84 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
85 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
86 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
87 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
88 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
89 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
90 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
91 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
92 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
93 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
94 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
95 })
96
97#else /* DILATION_X==1 && DILATION_Y==1 */
98
99#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
100 ({ \
101 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
102 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
103 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
104 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
105 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
106 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
107 })
108
109#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
110 ({ \
111 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
112 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
113 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
114 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
115 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
116 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
117 })
118
119#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
120 ({ \
121 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
122 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
123 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
124 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
125 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
126 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
127 acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
128 acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
129 acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
130 acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
131 acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
132 acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
133 })
134
135#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
136 ({ \
137 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
138 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
139 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
140 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
141 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
142 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
143 acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
144 acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
145 acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
146 acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
147 acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
148 acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
149 })
150
151#endif /* DILATION_X==1 && DILATION_Y==1 */
152
153#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100154#if defined(CONV_STRIDE_X)
155
Giorgio Arena93a690e2017-08-01 16:09:33 +0100156#if CONV_STRIDE_X == 1
157#define convolution1x3 convolution1x3_stride_1
158#elif CONV_STRIDE_X == 2
159#define convolution1x3 convolution1x3_stride_2
160#elif CONV_STRIDE_X == 3
161#define convolution1x3 convolution1x3_stride_3
162#else /* CONV_STRIDE_X */
163#error "Stride not supported"
164#endif /* CONV_STRIDE_X */
165
166/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
167 *
168 * @param[in] left_pixel Pointer to the left pixel.
169 * @param[in] left_coeff Weight of the left pixel
170 * @param[in] middle_coeff Weight of the middle pixel
171 * @param[in] right_coeff Weight of the right pixel
172 *
173 * @return a float2 containing 2 convoluted values.
174 */
175inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
176 const float left_coeff,
177 const float middle_coeff,
178 const float right_coeff)
179{
Usama Arife73686a2019-04-08 17:30:48 +0100180#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100181 float4 temp = vload4(0, (__global float *)left_pixel);
182
183 float2 left = CONVERT(temp.s01, float2);
184 float2 middle = CONVERT(temp.s12, float2);
185 float2 right = CONVERT(temp.s23, float2);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100186 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100187#else /* DILATION_X==1 && DILATION_Y==1 */
188 return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
189 + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
190 + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
191#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100192}
193
194/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
195 *
196 * @param[in] left_pixel Pointer to the left pixel.
197 * @param[in] left_coeff Weight of the left pixel
198 * @param[in] middle_coeff Weight of the middle pixel
199 * @param[in] right_coeff Weight of the right pixel
200 *
201 * @return a float2 containing 2 convoluted values.
202 */
203inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
204 const float left_coeff,
205 const float middle_coeff,
206 const float right_coeff)
207{
Usama Arife73686a2019-04-08 17:30:48 +0100208#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100209 float4 temp0 = vload4(0, (__global float *)left_pixel);
210 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
211
212 float2 left = CONVERT(temp0.s02, float2);
213 float2 middle = CONVERT(temp0.s13, float2);
214 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
215
216 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100217#else /* DILATION_X==1 && DILATION_Y==1 */
218 __global float *left_pixel_float = (__global float *)left_pixel;
219
220 return vload4(0, left_pixel_float).s02 * (float2)left_coeff
221 + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
222 + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
223
224#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100225}
226
227/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
228 *
229 * @param[in] left_pixel Pointer to the left pixel.
230 * @param[in] left_coeff Weight of the left pixel
231 * @param[in] middle_coeff Weight of the middle pixel
232 * @param[in] right_coeff Weight of the right pixel
233 *
234 * @return a float2 containing 2 convoluted values.
235 */
236inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
237 const float left_coeff,
238 const float middle_coeff,
239 const float right_coeff)
240{
Usama Arife73686a2019-04-08 17:30:48 +0100241#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100242 float4 temp0 = vload4(0, (__global float *)left_pixel);
243 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
244
245 float2 left = CONVERT(temp0.s03, float2);
246 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
247 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
248
249 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100250#else /* DILATION_X==1 && DILATION_Y==1 */
251 __global float *left_pixel_float = (__global float *)left_pixel;
252
253 return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
254 + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
255 + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
256#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100257}
258
259/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
260 *
261 * Convolution matrix layout:
262 *
263 * [ mat0, mat1, mat2 ]\n
264 * [ mat3, mat4, mat5 ]\n
265 * [ mat6, mat7, mat8 ]\n
266 *
267 * @param[in] src A pointer to source Image structure
268 * @param[in] mat0 Coefficient from the convolution matrix
269 * @param[in] mat1 Coefficient from the convolution matrix
270 * @param[in] mat2 Coefficient from the convolution matrix
271 * @param[in] mat3 Coefficient from the convolution matrix
272 * @param[in] mat4 Coefficient from the convolution matrix
273 * @param[in] mat5 Coefficient from the convolution matrix
274 * @param[in] mat6 Coefficient from the convolution matrix
275 * @param[in] mat0 Coefficient from the convolution matrix
276 * @param[in] mat7 Coefficient from the convolution matrix
277 * @param[in] mat8 Coefficient from the convolution matrix
278 *
279 * @return a float2 containing 2 convoluted values.
280 */
281inline float2 convolution3x3(
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100282 __global const uchar *src,
283 unsigned int src_stride_y,
Giorgio Arena93a690e2017-08-01 16:09:33 +0100284 const float mat0, const float mat1, const float mat2,
285 const float mat3, const float mat4, const float mat5,
286 const float mat6, const float mat7, const float mat8)
287{
288 float2 pixels;
289
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100290 pixels = convolution1x3((src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
291 pixels += convolution1x3((src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
292 pixels += convolution1x3((src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100293
294 return pixels;
295}
296
Gian Marcoc799ed82018-02-01 16:57:48 +0000297/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000298 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100299 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
300 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
301 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000302 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
303 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000304 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000305 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000306 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000307 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000308 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
309 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000310 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000311 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
312 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
313 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
314 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
315 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
316 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
317 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000318 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000319 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
320 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
321 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
322 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
323 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
324 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
325 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
326 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
327 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
328 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
329 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
330 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100331__kernel void depthwise_convolution_3x3(
332 TENSOR3D_DECLARATION(src),
333 TENSOR3D_DECLARATION(dst),
334 TENSOR3D_DECLARATION(weights)
335#if defined(HAS_BIAS)
336 ,
337 VECTOR_DECLARATION(biases)
338#endif //defined(HAS_BIAS)
339)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100340{
341 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
342 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100343 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100344
345 float2 pixels = 0.0f;
Giorgio Arena93a690e2017-08-01 16:09:33 +0100346
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100347 // Extract channel and linearized batch indices
348 const int channel = get_global_id(2) % DST_CHANNELS;
349 const int batch = get_global_id(2) / DST_CHANNELS;
350 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100351
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100352 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100353
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100354 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Giorgio Arena93a690e2017-08-01 16:09:33 +0100355
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100356 // Load the weights
357 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
358 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
359 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
360
361 pixels = convolution3x3(src_addr, src_stride_y,
362 weights_values0.s0, weights_values0.s1, weights_values0.s2,
363 weights_values1.s0, weights_values1.s1, weights_values1.s2,
364 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100365#if defined(HAS_BIAS)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +0100366 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
367
368 float bias = *((__global float *)(vector_offset(&biases, channel)));
369
370 pixels += (float2)bias;
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100371#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100372
Usama Arif6a98a6e2019-05-10 17:07:27 +0100373 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100374}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100375#endif //defined(CONV_STRIDE_X)
376
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100377#if(DILATION_X > 1 || DILATION_Y > 1)
Usama Arife73686a2019-04-08 17:30:48 +0100378
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100379/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100380 *
381 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
382 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
383 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
384 * @param[in] y_offset Offset from the source tensor from which to start convolution
385 * @param[in] weights_addr Pointer from where to get weights
386 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
387 */
388inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
389 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
390{
391 // Load the weights
392 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
393 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
394 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
395
396 float2 pixels0 = 0.0f;
397
398 float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
399 float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
400 float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
401
402 float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
403 float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
404 float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
405
406 float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
407 float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
408 float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
409
410 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
411 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
412 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
413
414 return pixels0;
415}
416
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100417/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100418 *
419 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
420 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
421 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
422 * @param[in] y_offset Offset from the source tensor from which to start convolution
423 * @param[in] weights_addr Pointer from where to get weights
424 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
425 */
426inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
427 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
428{
429 // Load the weights
430 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
431 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
432 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
433
434 float2 pixels0 = 0.0f;
435
436 float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
437 float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
438 float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
439
440 float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
441 float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
442 float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
443
444 float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
445 float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
446 float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
447
448 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
449 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
450 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
451
452 return pixels0;
453}
454
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100455#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
Usama Arife73686a2019-04-08 17:30:48 +0100456
Gian Marcoc799ed82018-02-01 16:57:48 +0000457/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
458 * stride_x and stride_y are equal to 1
459 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100460 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100461 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
462 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
463 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100464 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000465 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
466 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000467 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000468 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000469 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000470 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000471 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
472 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
473 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
474 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
475 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
476 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
477 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
478 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
479 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
480 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
481 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
482 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
483 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
484 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
485 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
486 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
487 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
488 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
489 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
490 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
491 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
492 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
493 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000494__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000495 TENSOR3D_DECLARATION(src),
496 TENSOR3D_DECLARATION(dst),
497 TENSOR3D_DECLARATION(weights)
498#if defined(HAS_BIAS)
499 ,
500 VECTOR_DECLARATION(biases)
501#endif //defined(HAS_BIAS)
502)
503{
504 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
505 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100506 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000507
508 float2 pixels0 = 0.0f;
509 float2 pixels1 = 0.0f;
510 float2 pixels2 = 0.0f;
511 float2 pixels3 = 0.0f;
512
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100513 // Extract channel and linearized batch indices
514 const int channel = get_global_id(2) % DST_CHANNELS;
515 const int batch = get_global_id(2) / DST_CHANNELS;
516 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
517 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
518 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000519
Usama Arife73686a2019-04-08 17:30:48 +0100520#if(DILATION_X == 1 && DILATION_Y == 1)
Gian Marcoc799ed82018-02-01 16:57:48 +0000521 // Load the weights
522 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
523 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
524 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
525
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000526 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000527 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
528 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
529 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
530 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000531 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
532 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000533
534 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
535 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
536 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
537 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
538 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
539 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
540 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
541 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
542 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
543 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
544 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
545 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
546
Usama Arife73686a2019-04-08 17:30:48 +0100547#else /* DILATION_X==1 && DILATION_Y==1 */
548
549 //3x3 Convolution of elements starting in 0th row
550 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
551 //3x3 Convolution of elements starting in 1st row
552 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
553 //3x3 Convolution of elements starting in 2nd row
554 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
555 //3x3 Convolution of elements starting in 3rd row
556 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
557
558#endif /* DILATION_X==1 && DILATION_Y==1 */
559
Gian Marcoc799ed82018-02-01 16:57:48 +0000560#ifdef HAS_BIAS
561 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
562
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100563 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000564
565 pixels0 += (float2)bias;
566 pixels1 += (float2)bias;
567 pixels2 += (float2)bias;
568 pixels3 += (float2)bias;
569#endif /* defined(HAS_BIAS) */
570
Usama Arif6a98a6e2019-05-10 17:07:27 +0100571 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
572 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
573 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
574 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000575}
576
577/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
578 * stride_x and stride_y are equal to 2
579 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100580 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100581 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
582 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
583 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100584 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000585 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
586 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000587 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000588 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000589 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000590 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000591 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
592 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
593 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
594 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
595 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
596 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
597 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
598 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
599 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
600 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
601 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
602 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
603 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
604 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
605 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
606 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
607 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
608 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
609 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
610 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
611 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
612 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
613 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000614__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000615 TENSOR3D_DECLARATION(src),
616 TENSOR3D_DECLARATION(dst),
617 TENSOR3D_DECLARATION(weights)
618#if defined(HAS_BIAS)
619 ,
620 VECTOR_DECLARATION(biases)
621#endif //defined(HAS_BIAS)
622)
623{
624 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
625 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100626 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000627
628 float2 pixels0 = 0.0f;
629 float2 pixels1 = 0.0f;
630
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100631 // Extract channel and linearized batch indices
Georgios Pinitas37044642018-10-30 14:53:25 +0000632 const int channel = get_global_id(2) % DST_CHANNELS;
633 const int batch = get_global_id(2) / DST_CHANNELS;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100634 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
635 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
636 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000637
Usama Arife73686a2019-04-08 17:30:48 +0100638#if(DILATION_X == 1 && DILATION_Y == 1)
639
Gian Marcoc799ed82018-02-01 16:57:48 +0000640 // Load the weights
641 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
642 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
643 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
644
645 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
646 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
647 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
648 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
649 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
650 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
651 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
652 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
653 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
654 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
655 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
656
657 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
658 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
659 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
660 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
661 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
662 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
663
Usama Arife73686a2019-04-08 17:30:48 +0100664#else /* DILATION_X==1 && DILATION_Y==1 */
665
666 //3x3 Convolution of elements starting in 0th row
667 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
668 //3x3 Convolution of elements starting in 2nd row
669 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
670#endif /* DILATION_X==1 && DILATION_Y==1 */
671
Gian Marcoc799ed82018-02-01 16:57:48 +0000672#ifdef HAS_BIAS
673 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
674
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100675 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000676
677 pixels0 += (float2)bias;
678 pixels1 += (float2)bias;
679#endif /* defined(HAS_BIAS) */
680
Usama Arif6a98a6e2019-05-10 17:07:27 +0100681 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
682 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000683}
684
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100685#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena76572242018-04-04 17:44:26 +0100686
giuros016d109962019-01-07 17:47:19 +0000687#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
688/** Reshape the weights for quantized depthwise convolution
689 *
690 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
691 * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
692 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
693 * @attention Input's height and width should be 3
694 *
695 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
696 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
697 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
698 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
699 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
700 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
701 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
702 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
703 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
704 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
705 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
706 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
707 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
708 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
709 */
710__kernel void depthwise_convolution_reshape_weights(
711 TENSOR3D_DECLARATION(src),
712 IMAGE_DECLARATION(dst))
713{
714 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
715 const int x = get_global_id(0);
716
717 // Load 3x3xVEC_SIZE weights
718 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
719 w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
720 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
721 w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
722 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
723 w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
724 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
725 w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
726 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
727 w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
728 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
729 w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
730 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
731 w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
732 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
733 w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
734 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
735 w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
736
737 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
738
739#if defined(TRANSPOSE)
740#if VEC_SIZE != 4
741#error "VEC_SIZE not supported"
742#else // VEC_SIZE != 4
743 VSTORE(VEC_SIZE)
744 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
745 VSTORE(VEC_SIZE)
746 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
747 VSTORE(VEC_SIZE)
748 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
749 VSTORE(VEC_SIZE)
750 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
751 VSTORE(VEC_SIZE)
752 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
753 VSTORE(VEC_SIZE)
754 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
755 VSTORE(VEC_SIZE)
756 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
757 VSTORE(VEC_SIZE)
758 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
759 VSTORE(VEC_SIZE)
760 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
761#endif // VEC_SIZE != 4
762#else // !defined(TRANSPOSE)
763 VSTORE(VEC_SIZE)
764 (w0, 0, dst_addr + 0);
765 VSTORE(VEC_SIZE)
766 (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
767 VSTORE(VEC_SIZE)
768 (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
769 VSTORE(VEC_SIZE)
770 (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
771 VSTORE(VEC_SIZE)
772 (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
773 VSTORE(VEC_SIZE)
774 (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
775 VSTORE(VEC_SIZE)
776 (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
777 VSTORE(VEC_SIZE)
778 (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
779 VSTORE(VEC_SIZE)
780 (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
781#endif // defined(TRANSPOSE)
782}
783#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
784
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100785#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000786#if defined(CONV_STRIDE_X)
787#if CONV_STRIDE_X == 1
788#define convolution1x3_f16 convolution1x3_stride_1_f16
789#elif CONV_STRIDE_X == 2
790#define convolution1x3_f16 convolution1x3_stride_2_f16
791#elif CONV_STRIDE_X == 3
792#define convolution1x3_f16 convolution1x3_stride_3_f16
793#else /* CONV_STRIDE_X */
794#error "Stride not supported"
795#endif /* CONV_STRIDE_X */
796
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100797#if(DILATION_X > 1 || DILATION_Y > 1)
798
799/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16
800 *
801 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
802 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
803 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
804 * @param[in] y_offset Offset from the source tensor from which to start convolution
805 * @param[in] weights_addr Pointer from where to get weights
806 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
807 */
808inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
809 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
810{
811 // Load the weights
812 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
813 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
814 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
815
816 half4 pixels0 = 0.0f;
817
818 half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
819 half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
820 half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
821
822 half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
823 half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
824 half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
825
826 half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
827 half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
828 half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
829
830 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
831 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
832 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
833
834 return pixels0;
835}
836
837/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
838 *
839 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
840 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
841 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
842 * @param[in] y_offset Offset from the source tensor from which to start convolution
843 * @param[in] weights_addr Pointer from where to get weights
844 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
845 */
846inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
847 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
848{
849 // Load the weights
850 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
851 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
852 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
853
854 half4 pixels0 = 0.0f;
855
856 half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
857 half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
858 half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
859
860 half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
861 half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
862 half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
863
864 half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
865 half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
866 half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
867
868 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
869 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
870 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
871
872 return pixels0;
873}
874
875#endif // (DILATION_X > 1 && DILATION_Y > 1)
876
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000877/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
878 *
879 * @param[in] left_pixel Pointer to the left pixel.
880 * @param[in] left_coeff Weight of the left pixel
881 * @param[in] middle_coeff Weight of the middle pixel
882 * @param[in] right_coeff Weight of the right pixel
883 *
884 * @return a half4 containing 4 convoluted values.
885 */
886inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
887 const half left_coeff,
888 const half middle_coeff,
889 const half right_coeff)
890{
Usama Arife73686a2019-04-08 17:30:48 +0100891#if(DILATION_X == 1 && DILATION_Y == 1)
892
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000893 half8 temp = vload8(0, (__global half *)left_pixel);
894
895 half4 left = CONVERT(temp.s0123, half4);
896 half4 middle = CONVERT(temp.s1234, half4);
897 half4 right = CONVERT(temp.s2345, half4);
898
899 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100900#else /* DILATION_X==1 && DILATION_Y==1 */
901 return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
902 + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
903 + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
904
905#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000906}
907
908/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
909 *
910 * @param[in] left_pixel Pointer to the left pixel.
911 * @param[in] left_coeff Weight of the left pixel
912 * @param[in] middle_coeff Weight of the middle pixel
913 * @param[in] right_coeff Weight of the right pixel
914 *
915 * @return a half4 containing 4 convoluted values.
916 */
917inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
918 const half left_coeff,
919 const half middle_coeff,
920 const half right_coeff)
921{
Usama Arife73686a2019-04-08 17:30:48 +0100922#if(DILATION_X == 1 && DILATION_Y == 1)
923
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000924 half8 temp0 = vload8(0, (__global half *)left_pixel);
925 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
926
927 half4 left = CONVERT(temp0.s0246, half4);
928 half4 middle = CONVERT(temp0.s1357, half4);
929 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
930
931 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100932#else /* DILATION_X==1 && DILATION_Y==1 */
933
934 __global half *left_pixel_float = (__global half *)left_pixel;
935
936 return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
937 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
938 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
939
940#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000941}
942
943/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
944 *
945 * @param[in] left_pixel Pointer to the left pixel.
946 * @param[in] left_coeff Weight of the left pixel
947 * @param[in] middle_coeff Weight of the middle pixel
948 * @param[in] right_coeff Weight of the right pixel
949 *
950 * @return a half4 containing 4 convoluted values.
951 */
952inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
953 const half left_coeff,
954 const half middle_coeff,
955 const half right_coeff)
956{
Usama Arife73686a2019-04-08 17:30:48 +0100957#if(DILATION_X == 1 && DILATION_Y == 1)
958
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000959 half16 temp0 = vload16(0, (__global half *)left_pixel);
960
961 half4 left = CONVERT(temp0.s0369, half4);
962 half4 middle = CONVERT(temp0.s147A, half4);
963 half4 right = CONVERT(temp0.s258B, half4);
964
965 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100966#else /* DILATION_X==1 && DILATION_Y==1 */
967
968 __global half *left_pixel_float = (__global half *)left_pixel;
969
970 return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
971 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
972 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
973
974#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000975}
976
977/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
978 *
979 * Convolution matrix layout:
980 *
981 * [ mat0, mat1, mat2 ]\n
982 * [ mat3, mat4, mat5 ]\n
983 * [ mat6, mat7, mat8 ]\n
984 *
985 * @param[in] src A pointer to source Image structure
986 * @param[in] mat0 Coefficient from the convolution matrix
987 * @param[in] mat1 Coefficient from the convolution matrix
988 * @param[in] mat2 Coefficient from the convolution matrix
989 * @param[in] mat3 Coefficient from the convolution matrix
990 * @param[in] mat4 Coefficient from the convolution matrix
991 * @param[in] mat5 Coefficient from the convolution matrix
992 * @param[in] mat6 Coefficient from the convolution matrix
993 * @param[in] mat0 Coefficient from the convolution matrix
994 * @param[in] mat7 Coefficient from the convolution matrix
995 * @param[in] mat8 Coefficient from the convolution matrix
996 *
997 * @return a half4 containing 4 convoluted values.
998 */
999inline half4 convolution3x3_f16(
1000 Image *src,
1001 const half mat0, const half mat1, const half mat2,
1002 const half mat3, const half mat4, const half mat5,
1003 const half mat6, const half mat7, const half mat8)
1004{
1005 half4 pixels;
1006
1007 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
Usama Arife73686a2019-04-08 17:30:48 +01001008 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
1009 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001010
1011 return pixels;
1012}
1013
Giorgio Arena76572242018-04-04 17:44:26 +01001014#if defined(DEPTH_MULTIPLIER)
1015
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001016/** This OpenCL kernel computes the depthwise convolution 3x3
1017 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001018 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001019 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1020 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1021 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001022 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001023 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1024 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001025 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001026 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001027 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001028 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1029 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001030 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001031 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001032 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1033 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1034 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1035 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1036 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1037 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1038 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001039 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001040 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1041 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1042 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1043 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1044 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1045 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1046 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001047 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001048 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1049 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1050 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1051 */
1052__kernel void depthwise_convolution_3x3_f16(
1053 TENSOR3D_DECLARATION(src),
1054 TENSOR3D_DECLARATION(dst),
1055 TENSOR3D_DECLARATION(weights)
1056#if defined(HAS_BIAS)
1057 ,
1058 VECTOR_DECLARATION(biases)
1059#endif //defined(HAS_BIAS)
1060)
1061{
1062 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1063 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001064 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001065#if defined(HAS_BIAS)
1066 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1067#endif //defined(HAS_BIAS)
1068
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001069 // Extract channel and linearized batch indices
1070 const int channel = get_global_id(2) % DST_CHANNELS;
1071 const int batch = get_global_id(2) / DST_CHANNELS;
1072 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1073 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
1074 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +01001075
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001076 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001077 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
1078 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
1079 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001080
1081 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
1082 weights_values1.s0, weights_values1.s1, weights_values1.s2,
1083 weights_values2.s0, weights_values2.s1, weights_values2.s2);
1084#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001085 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001086#endif //defined(HAS_BIAS)
1087
Usama Arif6a98a6e2019-05-10 17:07:27 +01001088 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001089}
Giorgio Arena76572242018-04-04 17:44:26 +01001090#endif // defined(DEPTH_MULTIPLIER)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001091#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001092
1093/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
1094 * when both stride_x and stride_y are equal to 1
1095 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001096 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001097 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1098 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1099 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001100 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001101 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1102 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001103 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001104 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001105 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001106 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1107 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001108 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001109 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1110 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1111 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1112 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1113 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1114 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1115 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1116 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1117 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1118 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1119 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1120 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1121 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1122 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1123 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1124 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1125 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1126 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1127 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1128 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1129 */
1130__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
1131 TENSOR3D_DECLARATION(src),
1132 TENSOR3D_DECLARATION(dst),
1133 TENSOR3D_DECLARATION(weights)
1134#if defined(HAS_BIAS)
1135 ,
1136 VECTOR_DECLARATION(biases)
1137#endif //defined(HAS_BIAS)
1138)
1139{
1140 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1141 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001142 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1143
1144 // Extract channel and linearized batch indices
1145 const int channel = get_global_id(2) % DST_CHANNELS;
1146 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001147
1148#ifdef HAS_BIAS
1149 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1150
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001151 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001152#endif /* defined(HAS_BIAS) */
1153
1154 half4 pixels0 = 0.0f;
1155 half4 pixels1 = 0.0f;
1156 half4 pixels2 = 0.0f;
1157 half4 pixels3 = 0.0f;
1158
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001159 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1160 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1161 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001162
Usama Arife73686a2019-04-08 17:30:48 +01001163#if(DILATION_X == 1 && DILATION_Y == 1)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001164 // Load the weights
1165 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1166 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1167 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1168
1169 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
1170 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1171 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1172 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1173 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1174 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1175 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
1176
1177 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
1178 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
1179 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
1180 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
1181 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
1182 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
1183 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
1184 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
1185 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
1186 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
1187 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
1188 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
1189
Usama Arife73686a2019-04-08 17:30:48 +01001190#else /* DILATION_X==1 && DILATION_Y==1 */
1191
1192 //3x3 Convolution of elements starting in 0th row
1193 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1194 //3x3 Convolution of elements starting in 1st row
1195 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
1196 //3x3 Convolution of elements starting in 2nd row
1197 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1198 //3x3 Convolution of elements starting in 3rd row
1199 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
1200
1201#endif /* DILATION_X==1 && DILATION_Y==1 */
1202
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001203#ifdef HAS_BIAS
1204 pixels0 += (half4)bias;
1205 pixels1 += (half4)bias;
1206 pixels2 += (half4)bias;
1207 pixels3 += (half4)bias;
1208#endif /* defined(HAS_BIAS) */
1209
Usama Arif6a98a6e2019-05-10 17:07:27 +01001210 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1211 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1212 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
1213 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001214}
1215
1216/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
1217 * when both stride_x and stride_y are equal to 2
1218 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001219 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001220 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1221 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1222 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001223 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001224 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1225 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001226 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001227 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001228 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001229 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001230 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1231 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001232 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1233 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1234 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1235 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1236 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1237 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1238 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1239 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1240 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1241 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1242 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1243 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1244 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1245 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1246 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1247 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1248 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1249 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1250 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1251 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1252 */
1253__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
1254 TENSOR3D_DECLARATION(src),
1255 TENSOR3D_DECLARATION(dst),
1256 TENSOR3D_DECLARATION(weights)
1257#if defined(HAS_BIAS)
1258 ,
1259 VECTOR_DECLARATION(biases)
1260#endif //defined(HAS_BIAS)
1261)
1262{
1263 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1264 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001265 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1266
1267 // Extract channel and linearized batch indices
1268 const int channel = get_global_id(2) % DST_CHANNELS;
1269 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001270
1271#ifdef HAS_BIAS
1272 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1273
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001274 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001275#endif /* defined(HAS_BIAS) */
1276
1277 half4 pixels0 = 0.0f;
1278 half4 pixels1 = 0.0f;
1279
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001280 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1281 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1282 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001283
Usama Arife73686a2019-04-08 17:30:48 +01001284#if(DILATION_X == 1 && DILATION_Y == 1)
1285
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001286 // Load the weights
1287 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1288 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1289 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1290
1291 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
1292 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1293 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1294 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1295 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1296 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1297 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1298 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1299 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1300 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1301 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1302
1303 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1304 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1305 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1306 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1307 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1308 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
1309
Usama Arife73686a2019-04-08 17:30:48 +01001310#else /* DILATION_X==1 && DILATION_Y==1 */
1311 //3x3 Convolution of elements starting in 0th row
1312 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1313 //3x3 Convolution of elements starting in 2nd row
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001314 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +01001315#endif /* DILATION_X==1 && DILATION_Y==1 */
1316
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001317#ifdef HAS_BIAS
1318 pixels0 += (half4)bias;
1319 pixels1 += (half4)bias;
1320#endif /* defined(HAS_BIAS) */
1321
Usama Arif6a98a6e2019-05-10 17:07:27 +01001322 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1323 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001324}
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001325#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Giorgio Arenad051e972018-06-20 11:46:42 +01001326
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001327#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP)
1328/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
1329 *
1330 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
1331 * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
1332 * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
1333 * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
1334 * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
1335 * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
1336 * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
1337 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1338 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1339 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1340 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
1341 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
1342 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1343 *
1344 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
1345 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1346 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1347 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1348 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1349 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1350 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1351 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1352 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1353 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1354 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1355 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1356 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1357 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1358 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1359 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1360 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1361 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1362 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1363 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1364 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
1365 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1366 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1367 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1368 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1369 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1370 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1371 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1372 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1373 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1374 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1375 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1376 */
1377__kernel void dwc_MxN_native_fp_nhwc(
1378 TENSOR4D_DECLARATION(src),
1379 TENSOR4D_DECLARATION(dst),
1380 TENSOR3D_DECLARATION(weights),
1381#if defined(HAS_BIAS)
1382 VECTOR_DECLARATION(biases)
1383#endif // defined(HAS_BIAS)
1384)
1385{
1386 int x = get_global_id(0); // channels
1387 int y = get_global_id(1); // spatial coordinate x
1388#if defined(DST_DEPTH)
1389 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1390 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1391#else // defined(DST_DEPTH)
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001392 int z = get_global_id(2); // spatial coordinate y
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001393#endif // defined(DST_DEPTH)
1394
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001395 __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001396
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001397 __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001398
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001399 __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001400
1401#if defined(HAS_BIAS)
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001402 __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001403#endif // defined(HAS_BIAS)
1404
1405#if defined(DST_DEPTH)
1406 s_addr += b * src_stride_w;
1407 d_addr += b * dst_stride_w;
1408#endif // defined(DST_DEPTH)
1409
1410 for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
1411 {
1412 // Each work-item computes N0x1x1 elements
1413 VEC_DATA_TYPE(DATA_TYPE, N0)
1414 res = 0;
1415
1416 int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
1417 int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1418
1419 for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
1420 {
1421 if(y_coord >= 0 && y_coord < SRC_DIM2)
1422 {
1423 int x_coord_tmp = x_coord;
1424
1425 for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
1426 {
1427 if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
1428 {
1429 int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
1430 int w_offset = xk * weights_stride_y + yk * weights_stride_z;
1431
1432 // Load input and weights values
1433 VEC_DATA_TYPE(DATA_TYPE, N0)
1434 i = VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset));
1435 VEC_DATA_TYPE(DATA_TYPE, N0)
1436 w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset));
1437
1438#if GPU_ARCH == GPU_ARCH_MIDGARD
1439 res += i * w;
1440#else // GPU_ARCH == GPU_ARCH_MIDGARD
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001441 res = fma(i, w, res);
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001442#endif // GPU_ARCH == GPU_ARCH_MIDGARD
1443 }
1444 x_coord_tmp += DILATION_X;
1445 }
1446 }
1447 y_coord += DILATION_Y;
1448 }
1449
1450#if defined(HAS_BIAS)
1451 res += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));
1452#endif // defined(HAS_BIAS)
1453
1454 res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, res, A_VAL, B_VAL);
1455
1456 VSTORE(N0)
1457 (res, 0, (__global DATA_TYPE *)(d_addr));
1458
1459 w_addr += sizeof(DATA_TYPE);
1460 d_addr += sizeof(DATA_TYPE);
1461#if defined(HAS_BIAS)
1462 b_addr += sizeof(DATA_TYPE);
1463#endif // defined(HAS_BIAS)
1464 }
1465}
1466#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP)
1467
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001468#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001469
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001470#if DATA_TYPE != float || DATA_TYPE != half
1471#error "Unsupported data type"
1472#endif // DATA_TYPE != float || DATA_TYPE != half
1473
1474#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001475
1476#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
Gian Marco Iodice9285adb2019-09-05 16:10:27 +01001477
Giorgio Arenad051e972018-06-20 11:46:42 +01001478/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1479 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001480 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001481 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1482 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1483 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1484 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1485 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1486 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001487 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001488 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1489 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arenad051e972018-06-20 11:46:42 +01001490 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001491 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001492 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001493 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001494 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001495 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001496 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001497 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1498 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1499 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1500 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001501 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1502 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1503 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1504 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1505 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1506 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1507 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001508 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1509 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001510 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001511 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001512 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1513 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1514 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1515 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1516 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1517 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1518 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1519 * @param[in] max_offset Max offset for the input tensor
1520 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1521 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1522 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1523 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1524 */
1525__kernel void depthwise_convolution_3x3_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001526 TENSOR4D_DECLARATION(src),
1527 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001528 TENSOR3D_DECLARATION(weights),
1529#if defined(HAS_BIAS)
1530 VECTOR_DECLARATION(biases),
1531#endif /* defined(HAS_BIAS) */
1532 int max_offset)
1533{
1534 int x = get_global_id(0); // channels
1535 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001536#if defined(DST_DEPTH)
1537 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1538 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001539#else // defined(DST_DEPTH)
1540 int z = get_global_id(2); // spatial coordinate y
1541#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001542
1543 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1544
Georgios Pinitas37044642018-10-30 14:53:25 +00001545#if defined(DST_DEPTH)
1546 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1547#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001548 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001549#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001550
1551 int z_coord = 0;
1552 int4 offset = 0;
Usama Arife73686a2019-04-08 17:30:48 +01001553 int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001554
1555 // We compute 2x1x1 [C,W,H] elements
1556 VEC_FLOAT acc = 0;
1557
1558 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001559 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1560 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1561 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1562 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1563 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1564 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1565 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1566 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1567 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001568
1569 // Load input values
1570 // z == 0
1571 // Clamp z_coord as for z = 0, it can be negative
1572 // z_coord is casted to unsigned int in order to use just a min() operation
1573 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
1574 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1575 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1576 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001577 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001578
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001579 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1580 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1581 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001582
1583 // z == 1
1584 // z_coord can be only negative for z = 0 so we do not need to clamp it
1585 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Usama Arife73686a2019-04-08 17:30:48 +01001586 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001587 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001588 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1589 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1590 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001591
1592 // z == 2
Usama Arife73686a2019-04-08 17:30:48 +01001593 // Offset can be out-of-bound so we need to check if it is greater than max_offset
1594 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
1595 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001596 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001597 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1598 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1599 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001600
1601 acc = fma(values0, w0, acc);
1602 acc = fma(values1, w1, acc);
1603 acc = fma(values2, w2, acc);
1604
1605 acc = fma(values3, w3, acc);
1606 acc = fma(values4, w4, acc);
1607 acc = fma(values5, w5, acc);
1608
1609 acc = fma(values6, w6, acc);
1610 acc = fma(values7, w7, acc);
1611 acc = fma(values8, w8, acc);
1612
1613#if defined(HAS_BIAS)
1614 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001615 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001616 acc += bias_values;
1617#endif // defined(HAS_BIAS)
1618
Georgios Pinitas37044642018-10-30 14:53:25 +00001619#if defined(DST_DEPTH)
1620 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1621#else /* defined(DST_DEPTH) */
1622 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
1623#endif /* defined(DST_DEPTH) */
1624
Giorgio Arenad051e972018-06-20 11:46:42 +01001625 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001626 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr));
Giorgio Arenad051e972018-06-20 11:46:42 +01001627}
1628#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1629
1630#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1631/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1632 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001633 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001634 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1635 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1636 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1637 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1638 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1639 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001640 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001641 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1642 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arenad051e972018-06-20 11:46:42 +01001643 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001644 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001645 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001646 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001647 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001648 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001649 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001650 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1651 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1652 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1653 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001654 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1655 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1656 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1657 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1658 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1659 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1660 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001661 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1662 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001663 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001664 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001665 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1666 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1667 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1668 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1669 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1670 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1671 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1672 * @param[in] max_offset Max offset for the input tensor
1673 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1674 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1675 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1676 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1677 */
1678__kernel void depthwise_convolution_3x3_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +00001679 TENSOR4D_DECLARATION(src),
1680 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001681 TENSOR3D_DECLARATION(weights),
1682#if defined(HAS_BIAS)
1683 VECTOR_DECLARATION(biases),
1684#endif /* defined(HAS_BIAS) */
1685 int max_offset)
1686{
1687 int x = get_global_id(0); // channels
1688 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001689#if defined(DST_DEPTH)
1690 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1691 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001692#else // defined(DST_DEPTH)
1693 int z = get_global_id(2); // spatial coordinate y
1694#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001695
1696 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1697
Georgios Pinitas37044642018-10-30 14:53:25 +00001698#if defined(DST_DEPTH)
1699 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1700#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001701 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001702#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001703
1704 int z_coord = 0;
1705 int4 offset = 0;
Georgios Pinitased32f432018-07-10 17:03:11 +01001706 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001707
1708 // We compute 2x2x2 [C,W,H] elements
1709 VEC_FLOAT acc0 = 0;
1710 VEC_FLOAT acc1 = 0;
1711 VEC_FLOAT acc2 = 0;
1712 VEC_FLOAT acc3 = 0;
1713
1714 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001715 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1716 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1717 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1718 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1719 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1720 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1721 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1722 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1723 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001724
1725 // Load input values
1726 // z == 0
1727 // Clamp z_coord as for z = 0, it can be negative
1728 // z_coord is casted to unsigned int in order to use just a min() operation
1729 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitased32f432018-07-10 17:03:11 +01001730 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenad051e972018-06-20 11:46:42 +01001731 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1732 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001733 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001734
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001735 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1736 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1737 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1738 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001739
1740 // z == 1
1741 // z_coord can be only negative for z = 0 so we do not need to clamp it
1742 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001743 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenad051e972018-06-20 11:46:42 +01001744 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001745 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1746 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1747 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1748 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001749
1750 // z == 2
1751 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1752 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1753 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001754 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001755 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1756 VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1757 VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1758 VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001759
1760 // z == 3
1761 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1762 // However offset can be out-of-bound so we need to check if it is greater than max_offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001763 offset += (int4)src_stride_z;
1764 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001765 VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1766 VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1767 VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1768 VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001769
1770 acc0 = fma(values0, w0, acc0);
1771 acc0 = fma(values1, w1, acc0);
1772 acc0 = fma(values2, w2, acc0);
1773 acc1 = fma(values1, w0, acc1);
1774 acc1 = fma(values2, w1, acc1);
1775 acc1 = fma(values3, w2, acc1);
1776
1777 acc0 = fma(values4, w3, acc0);
1778 acc0 = fma(values5, w4, acc0);
1779 acc0 = fma(values6, w5, acc0);
1780 acc1 = fma(values5, w3, acc1);
1781 acc1 = fma(values6, w4, acc1);
1782 acc1 = fma(values7, w5, acc1);
1783
1784 acc0 = fma(values8, w6, acc0);
1785 acc0 = fma(values9, w7, acc0);
1786 acc0 = fma(values10, w8, acc0);
1787 acc1 = fma(values9, w6, acc1);
1788 acc1 = fma(values10, w7, acc1);
1789 acc1 = fma(values11, w8, acc1);
1790
1791 acc2 = fma(values4, w0, acc2);
1792 acc2 = fma(values5, w1, acc2);
1793 acc2 = fma(values6, w2, acc2);
1794 acc3 = fma(values5, w0, acc3);
1795 acc3 = fma(values6, w1, acc3);
1796 acc3 = fma(values7, w2, acc3);
1797
1798 acc2 = fma(values8, w3, acc2);
1799 acc2 = fma(values9, w4, acc2);
1800 acc2 = fma(values10, w5, acc2);
1801 acc3 = fma(values9, w3, acc3);
1802 acc3 = fma(values10, w4, acc3);
1803 acc3 = fma(values11, w5, acc3);
1804
1805 acc2 = fma(values12, w6, acc2);
1806 acc2 = fma(values13, w7, acc2);
1807 acc2 = fma(values14, w8, acc2);
1808 acc3 = fma(values13, w6, acc3);
1809 acc3 = fma(values14, w7, acc3);
1810 acc3 = fma(values15, w8, acc3);
1811
1812#if defined(HAS_BIAS)
1813 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1814
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001815 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001816
1817 acc0 += bias_values;
1818 acc1 += bias_values;
1819 acc2 += bias_values;
1820 acc3 += bias_values;
1821#endif // defined(HAS_BIAS)
1822
Georgios Pinitas37044642018-10-30 14:53:25 +00001823#if defined(DST_DEPTH)
1824 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1825#else /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001826 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001827#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001828
1829 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001830 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc0, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001831 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001832 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc1, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001833
1834#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1835 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1836#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1837 {
1838 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001839 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc2, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001840 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001841 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc3, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001842 }
1843}
1844
1845#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Gian Marco Iodice3ae323f2019-08-13 14:56:50 +01001846#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)