blob: c55a3d91c2f993bf5a9c820d62864db067c3ddb7 [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
giuros016d109962019-01-07 17:47:19 +00002 * Copyright (c) 2017-2019 ARM Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26
Usama Arif6a98a6e2019-05-10 17:07:27 +010027#include "activation_float_helpers.h"
Manuel Bottinia788c2f2019-04-08 13:18:00 +010028
29/** Get the pointer position at a certain offset in x and y direction.
30 *
31 * @param[in] ptr Pointer to the starting position of the buffer
32 * @param[in] x Relative X position
33 * @param[in] y Relative Y position
34 * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
35 * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
36 *
37 * @return a uchar
38 */
39inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
40{
41 return ptr + x * stride_x + y * stride_y;
42}
43
44#if(DILATION_X == 1 && DILATION_Y == 1)
45
46#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
47 ({ \
48 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
49 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
50 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
51 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
52 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
53 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
54 })
55
56#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
57 ({ \
58 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
59 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
60 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
61 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
62 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
63 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
64 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
65 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
66 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
67 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
68 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
69 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
70 })
71
72#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
73 ({ \
74 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
75 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
76 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
77 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
78 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
79 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
80 })
81
82#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
83 ({ \
84 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
85 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
86 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
87 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
88 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
89 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
90 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
91 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
92 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
93 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
94 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
95 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
96 })
97
98#else /* DILATION_X==1 && DILATION_Y==1 */
99
100#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
101 ({ \
102 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
103 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
104 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
105 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
106 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
107 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
108 })
109
110#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
111 ({ \
112 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
113 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
114 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
115 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
116 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
117 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
118 })
119
120#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
121 ({ \
122 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
123 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
124 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
125 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
126 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
127 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
128 acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
129 acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
130 acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
131 acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
132 acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
133 acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
134 })
135
136#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
137 ({ \
138 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
139 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
140 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
141 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
142 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
143 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
144 acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
145 acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
146 acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
147 acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
148 acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
149 acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
150 })
151
152#endif /* DILATION_X==1 && DILATION_Y==1 */
153
154#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100155#if defined(CONV_STRIDE_X)
156
Giorgio Arena93a690e2017-08-01 16:09:33 +0100157#if CONV_STRIDE_X == 1
158#define convolution1x3 convolution1x3_stride_1
159#elif CONV_STRIDE_X == 2
160#define convolution1x3 convolution1x3_stride_2
161#elif CONV_STRIDE_X == 3
162#define convolution1x3 convolution1x3_stride_3
163#else /* CONV_STRIDE_X */
164#error "Stride not supported"
165#endif /* CONV_STRIDE_X */
166
167/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
168 *
169 * @param[in] left_pixel Pointer to the left pixel.
170 * @param[in] left_coeff Weight of the left pixel
171 * @param[in] middle_coeff Weight of the middle pixel
172 * @param[in] right_coeff Weight of the right pixel
173 *
174 * @return a float2 containing 2 convoluted values.
175 */
176inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
177 const float left_coeff,
178 const float middle_coeff,
179 const float right_coeff)
180{
Usama Arife73686a2019-04-08 17:30:48 +0100181#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100182 float4 temp = vload4(0, (__global float *)left_pixel);
183
184 float2 left = CONVERT(temp.s01, float2);
185 float2 middle = CONVERT(temp.s12, float2);
186 float2 right = CONVERT(temp.s23, float2);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100187 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100188#else /* DILATION_X==1 && DILATION_Y==1 */
189 return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
190 + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
191 + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
192#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100193}
194
195/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
196 *
197 * @param[in] left_pixel Pointer to the left pixel.
198 * @param[in] left_coeff Weight of the left pixel
199 * @param[in] middle_coeff Weight of the middle pixel
200 * @param[in] right_coeff Weight of the right pixel
201 *
202 * @return a float2 containing 2 convoluted values.
203 */
204inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
205 const float left_coeff,
206 const float middle_coeff,
207 const float right_coeff)
208{
Usama Arife73686a2019-04-08 17:30:48 +0100209#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100210 float4 temp0 = vload4(0, (__global float *)left_pixel);
211 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
212
213 float2 left = CONVERT(temp0.s02, float2);
214 float2 middle = CONVERT(temp0.s13, float2);
215 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
216
217 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100218#else /* DILATION_X==1 && DILATION_Y==1 */
219 __global float *left_pixel_float = (__global float *)left_pixel;
220
221 return vload4(0, left_pixel_float).s02 * (float2)left_coeff
222 + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
223 + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
224
225#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100226}
227
228/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
229 *
230 * @param[in] left_pixel Pointer to the left pixel.
231 * @param[in] left_coeff Weight of the left pixel
232 * @param[in] middle_coeff Weight of the middle pixel
233 * @param[in] right_coeff Weight of the right pixel
234 *
235 * @return a float2 containing 2 convoluted values.
236 */
237inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
238 const float left_coeff,
239 const float middle_coeff,
240 const float right_coeff)
241{
Usama Arife73686a2019-04-08 17:30:48 +0100242#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100243 float4 temp0 = vload4(0, (__global float *)left_pixel);
244 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
245
246 float2 left = CONVERT(temp0.s03, float2);
247 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
248 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
249
250 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100251#else /* DILATION_X==1 && DILATION_Y==1 */
252 __global float *left_pixel_float = (__global float *)left_pixel;
253
254 return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
255 + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
256 + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
257#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100258}
259
260/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
261 *
262 * Convolution matrix layout:
263 *
264 * [ mat0, mat1, mat2 ]\n
265 * [ mat3, mat4, mat5 ]\n
266 * [ mat6, mat7, mat8 ]\n
267 *
268 * @param[in] src A pointer to source Image structure
269 * @param[in] mat0 Coefficient from the convolution matrix
270 * @param[in] mat1 Coefficient from the convolution matrix
271 * @param[in] mat2 Coefficient from the convolution matrix
272 * @param[in] mat3 Coefficient from the convolution matrix
273 * @param[in] mat4 Coefficient from the convolution matrix
274 * @param[in] mat5 Coefficient from the convolution matrix
275 * @param[in] mat6 Coefficient from the convolution matrix
276 * @param[in] mat0 Coefficient from the convolution matrix
277 * @param[in] mat7 Coefficient from the convolution matrix
278 * @param[in] mat8 Coefficient from the convolution matrix
279 *
280 * @return a float2 containing 2 convoluted values.
281 */
282inline float2 convolution3x3(
283 Image *src,
284 const float mat0, const float mat1, const float mat2,
285 const float mat3, const float mat4, const float mat5,
286 const float mat6, const float mat7, const float mat8)
287{
288 float2 pixels;
289
290 pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
Usama Arife73686a2019-04-08 17:30:48 +0100291 pixels += convolution1x3(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
292 pixels += convolution1x3(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100293
294 return pixels;
295}
296
Gian Marcoc799ed82018-02-01 16:57:48 +0000297/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000298 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100299 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
300 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
301 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000302 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
303 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000304 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000305 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000306 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000307 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000308 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
309 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000310 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000311 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
312 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
313 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
314 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
315 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
316 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
317 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000318 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000319 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
320 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
321 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
322 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
323 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
324 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
325 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
326 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
327 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
328 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
329 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
330 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100331__kernel void depthwise_convolution_3x3(
332 TENSOR3D_DECLARATION(src),
333 TENSOR3D_DECLARATION(dst),
334 TENSOR3D_DECLARATION(weights)
335#if defined(HAS_BIAS)
336 ,
337 VECTOR_DECLARATION(biases)
338#endif //defined(HAS_BIAS)
339)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100340{
341 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
342 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100343 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100344#if defined(HAS_BIAS)
345 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
346#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100347
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100348 // Extract channel and linearized batch indices
349 const int channel = get_global_id(2) % DST_CHANNELS;
350 const int batch = get_global_id(2) / DST_CHANNELS;
351 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
352 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
353 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100354
Giorgio Arena93a690e2017-08-01 16:09:33 +0100355 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100356 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + offset.s0));
357 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + offset.s1));
358 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + offset.s2));
Giorgio Arena93a690e2017-08-01 16:09:33 +0100359
360 float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
361 weights_values1.s0, weights_values1.s1, weights_values1.s2,
362 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100363#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100364 pixels += (float2)(*((__global float *)(biases.ptr + channel * biases_stride_x)));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100365#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100366
Usama Arif6a98a6e2019-05-10 17:07:27 +0100367 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100368}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100369#endif //defined(CONV_STRIDE_X)
370
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100371#if(DILATION_X > 1 || DILATION_Y > 1)
Usama Arife73686a2019-04-08 17:30:48 +0100372
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100373/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100374 *
375 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
376 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
377 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
378 * @param[in] y_offset Offset from the source tensor from which to start convolution
379 * @param[in] weights_addr Pointer from where to get weights
380 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
381 */
382inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
383 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
384{
385 // Load the weights
386 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
387 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
388 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
389
390 float2 pixels0 = 0.0f;
391
392 float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
393 float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
394 float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
395
396 float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
397 float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
398 float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
399
400 float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
401 float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
402 float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
403
404 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
405 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
406 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
407
408 return pixels0;
409}
410
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100411/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100412 *
413 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
414 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
415 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
416 * @param[in] y_offset Offset from the source tensor from which to start convolution
417 * @param[in] weights_addr Pointer from where to get weights
418 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
419 */
420inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
421 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
422{
423 // Load the weights
424 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
425 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
426 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
427
428 float2 pixels0 = 0.0f;
429
430 float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
431 float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
432 float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
433
434 float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
435 float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
436 float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
437
438 float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
439 float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
440 float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
441
442 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
443 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
444 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
445
446 return pixels0;
447}
448
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100449#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
Usama Arife73686a2019-04-08 17:30:48 +0100450
Gian Marcoc799ed82018-02-01 16:57:48 +0000451/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
452 * stride_x and stride_y are equal to 1
453 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100454 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100455 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
456 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
457 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100458 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000459 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
460 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000461 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000462 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000463 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000464 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000465 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
466 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
467 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
468 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
469 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
470 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
471 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
472 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
473 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
474 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
475 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
476 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
477 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
478 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
479 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
480 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
481 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
482 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
483 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
484 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
485 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
486 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
487 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000488__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000489 TENSOR3D_DECLARATION(src),
490 TENSOR3D_DECLARATION(dst),
491 TENSOR3D_DECLARATION(weights)
492#if defined(HAS_BIAS)
493 ,
494 VECTOR_DECLARATION(biases)
495#endif //defined(HAS_BIAS)
496)
497{
498 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
499 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100500 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000501
502 float2 pixels0 = 0.0f;
503 float2 pixels1 = 0.0f;
504 float2 pixels2 = 0.0f;
505 float2 pixels3 = 0.0f;
506
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100507 // Extract channel and linearized batch indices
508 const int channel = get_global_id(2) % DST_CHANNELS;
509 const int batch = get_global_id(2) / DST_CHANNELS;
510 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
511 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
512 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000513
Usama Arife73686a2019-04-08 17:30:48 +0100514#if(DILATION_X == 1 && DILATION_Y == 1)
Gian Marcoc799ed82018-02-01 16:57:48 +0000515 // Load the weights
516 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
517 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
518 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
519
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000520 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000521 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
522 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
523 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
524 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000525 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
526 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000527
528 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
529 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
530 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
531 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
532 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
533 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
534 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
535 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
536 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
537 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
538 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
539 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
540
Usama Arife73686a2019-04-08 17:30:48 +0100541#else /* DILATION_X==1 && DILATION_Y==1 */
542
543 //3x3 Convolution of elements starting in 0th row
544 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
545 //3x3 Convolution of elements starting in 1st row
546 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
547 //3x3 Convolution of elements starting in 2nd row
548 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
549 //3x3 Convolution of elements starting in 3rd row
550 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
551
552#endif /* DILATION_X==1 && DILATION_Y==1 */
553
Gian Marcoc799ed82018-02-01 16:57:48 +0000554#ifdef HAS_BIAS
555 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
556
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100557 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000558
559 pixels0 += (float2)bias;
560 pixels1 += (float2)bias;
561 pixels2 += (float2)bias;
562 pixels3 += (float2)bias;
563#endif /* defined(HAS_BIAS) */
564
Usama Arif6a98a6e2019-05-10 17:07:27 +0100565 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
566 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
567 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
568 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000569}
570
571/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
572 * stride_x and stride_y are equal to 2
573 *
Usama Arif6a98a6e2019-05-10 17:07:27 +0100574 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100575 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
576 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
577 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100578 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000579 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
580 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000581 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000582 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000583 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000584 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000585 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
586 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
587 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
588 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
589 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
590 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
591 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
592 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
593 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
594 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
595 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
596 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
597 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
598 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
599 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
600 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
601 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
602 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
603 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
604 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
605 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
606 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
607 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000608__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000609 TENSOR3D_DECLARATION(src),
610 TENSOR3D_DECLARATION(dst),
611 TENSOR3D_DECLARATION(weights)
612#if defined(HAS_BIAS)
613 ,
614 VECTOR_DECLARATION(biases)
615#endif //defined(HAS_BIAS)
616)
617{
618 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
619 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100620 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000621
622 float2 pixels0 = 0.0f;
623 float2 pixels1 = 0.0f;
624
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100625 // Extract channel and linearized batch indices
Georgios Pinitas37044642018-10-30 14:53:25 +0000626 const int channel = get_global_id(2) % DST_CHANNELS;
627 const int batch = get_global_id(2) / DST_CHANNELS;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100628 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
629 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
630 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000631
Usama Arife73686a2019-04-08 17:30:48 +0100632#if(DILATION_X == 1 && DILATION_Y == 1)
633
Gian Marcoc799ed82018-02-01 16:57:48 +0000634 // Load the weights
635 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
636 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
637 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
638
639 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
640 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
641 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
642 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
643 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
644 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
645 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
646 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
647 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
648 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
649 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
650
651 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
652 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
653 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
654 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
655 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
656 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
657
Usama Arife73686a2019-04-08 17:30:48 +0100658#else /* DILATION_X==1 && DILATION_Y==1 */
659
660 //3x3 Convolution of elements starting in 0th row
661 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
662 //3x3 Convolution of elements starting in 2nd row
663 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
664#endif /* DILATION_X==1 && DILATION_Y==1 */
665
Gian Marcoc799ed82018-02-01 16:57:48 +0000666#ifdef HAS_BIAS
667 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
668
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100669 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000670
671 pixels0 += (float2)bias;
672 pixels1 += (float2)bias;
673#endif /* defined(HAS_BIAS) */
674
Usama Arif6a98a6e2019-05-10 17:07:27 +0100675 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
676 vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000677}
678
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100679#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena76572242018-04-04 17:44:26 +0100680
giuros016d109962019-01-07 17:47:19 +0000681#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
682/** Reshape the weights for quantized depthwise convolution
683 *
684 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
685 * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
686 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
687 * @attention Input's height and width should be 3
688 *
689 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
690 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
691 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
692 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
693 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
694 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
695 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
696 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
697 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
698 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
699 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
700 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
701 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
702 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
703 */
704__kernel void depthwise_convolution_reshape_weights(
705 TENSOR3D_DECLARATION(src),
706 IMAGE_DECLARATION(dst))
707{
708 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
709 const int x = get_global_id(0);
710
711 // Load 3x3xVEC_SIZE weights
712 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
713 w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
714 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
715 w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
716 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
717 w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
718 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
719 w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
720 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
721 w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
722 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
723 w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
724 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
725 w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
726 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
727 w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
728 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
729 w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
730
731 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
732
733#if defined(TRANSPOSE)
734#if VEC_SIZE != 4
735#error "VEC_SIZE not supported"
736#else // VEC_SIZE != 4
737 VSTORE(VEC_SIZE)
738 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
739 VSTORE(VEC_SIZE)
740 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
741 VSTORE(VEC_SIZE)
742 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
743 VSTORE(VEC_SIZE)
744 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
745 VSTORE(VEC_SIZE)
746 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
747 VSTORE(VEC_SIZE)
748 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
749 VSTORE(VEC_SIZE)
750 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
751 VSTORE(VEC_SIZE)
752 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
753 VSTORE(VEC_SIZE)
754 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
755#endif // VEC_SIZE != 4
756#else // !defined(TRANSPOSE)
757 VSTORE(VEC_SIZE)
758 (w0, 0, dst_addr + 0);
759 VSTORE(VEC_SIZE)
760 (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
761 VSTORE(VEC_SIZE)
762 (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
763 VSTORE(VEC_SIZE)
764 (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
765 VSTORE(VEC_SIZE)
766 (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
767 VSTORE(VEC_SIZE)
768 (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
769 VSTORE(VEC_SIZE)
770 (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
771 VSTORE(VEC_SIZE)
772 (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
773 VSTORE(VEC_SIZE)
774 (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
775#endif // defined(TRANSPOSE)
776}
777#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
778
Giorgio Arenad051e972018-06-20 11:46:42 +0100779#if defined(NCHW)
780#define in_stride_x src_stride_x
781#define in_stride_y src_stride_y
782#define in_stride_z src_stride_z
783#define out_stride_x dst_stride_x
784#define out_stride_y dst_stride_y
785#define out_stride_z dst_stride_z
786#else //defined(NCHW)
787#define in_stride_x src_stride_y
788#define in_stride_y src_stride_z
789#define in_stride_z src_stride_x
790#define out_stride_x dst_stride_y
791#define out_stride_y dst_stride_z
792#define out_stride_z dst_stride_x
793#endif //defined(NCHW)
794
Giorgio Arena9fe41442017-08-23 16:36:24 +0100795#if defined(SRC_WIDTH) && defined(DATA_TYPE)
796/** This kernel reshapes each of the tensor's low three dimensions to single rows.
797 *
798 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
799 *
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100800 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
801 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
802 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
803 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
804 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
805 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
806 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
807 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
808 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
809 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
810 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
811 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
812 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
813 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
814 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
815 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
816 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
817 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Giorgio Arena9fe41442017-08-23 16:36:24 +0100818 */
giuros016d109962019-01-07 17:47:19 +0000819__kernel void depthwise_convolution_reshape_weights_generic(
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100820 TENSOR3D_DECLARATION(src),
821 IMAGE_DECLARATION(dst)
822#ifdef HAS_BIAS
823 ,
824 VECTOR_DECLARATION(biases)
825#endif /* HAS_BIAS */
826)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100827{
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100828#ifdef HAS_BIAS
829 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
830#endif /* HAS_BIAS */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100831
Giorgio Arenad051e972018-06-20 11:46:42 +0100832 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
833 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100834
Giorgio Arenad051e972018-06-20 11:46:42 +0100835 for(int i = 0; i < SRC_WIDTH; ++i, input_ptr += in_stride_x)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100836 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100837 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *((__global DATA_TYPE *)input_ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100838 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100839
840#if defined(HAS_BIAS)
841 if(get_global_id(1) == 0)
842 {
Michele Di Giorgiod24af8a2018-05-08 17:23:52 +0100843 *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100844 }
845#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100846}
847#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
848
Usama Arife73686a2019-04-08 17:30:48 +0100849#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER) && defined(DILATION_X) && defined(DILATION_Y)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100850/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
851 *
852 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Giorgio Arena76572242018-04-04 17:44:26 +0100853 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
Usama Arife73686a2019-04-08 17:30:48 +0100854 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
Giorgio Arena9fe41442017-08-23 16:36:24 +0100855 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100856 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100857 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
858 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
859 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
860 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
861 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
862 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
863 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
864 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
865 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
866 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
867 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
868 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
869 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
870 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
871 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
872 */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100873__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
874{
875 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
876
877 const int src_pixel_linear = get_global_id(1) * STRIDE_X;
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100878 const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
Usama Arife73686a2019-04-08 17:30:48 +0100879 const int max_initial_x = STRIDE_X * (((full_length - (KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1))) / STRIDE_X) + 1);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100880
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100881 const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
882 const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
Giorgio Arena76572242018-04-04 17:44:26 +0100883 const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100884
Giorgio Arenad051e972018-06-20 11:46:42 +0100885 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100886 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
887
Usama Arife73686a2019-04-08 17:30:48 +0100888 for(int y = src_y; y < src_y + KERNEL_HEIGHT + (KERNEL_HEIGHT - 1) * (DILATION_Y - 1); y += DILATION_Y)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100889 {
Usama Arife73686a2019-04-08 17:30:48 +0100890 for(int x = src_x; x < src_x + KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1); x += DILATION_X, ++output_ptr)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100891 {
892 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
893 {
Georgios Pinitasde5a1cc2018-02-02 12:52:07 +0000894 *output_ptr = PAD_VALUE;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100895 }
896 else
897 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100898 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * in_stride_x + y * in_stride_y));
Giorgio Arena9fe41442017-08-23 16:36:24 +0100899 }
900 }
901 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100902#if defined(HAS_BIAS)
903 *output_ptr = (DATA_TYPE)(1);
904#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100905}
906
Giorgio Arena76572242018-04-04 17:44:26 +0100907#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100908
909#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
910
911/** This kernel performs a reshaping of the output of the depthwise generic convolution.
912 *
913 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
914 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
915 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100916 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100917 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
918 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
919 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
920 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
921 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
922 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
923 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
924 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
925 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
926 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
927 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
928 */
929__kernel void depthwise_vector_to_tensor(
930 VECTOR_DECLARATION(src),
931 TENSOR3D_DECLARATION(dst))
932{
933 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
934
935 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
936 const int id0 = get_global_id(0);
937 const int z = id0 / patch_size;
938 const int index2D = id0 - z * patch_size;
939
Giorgio Arenad051e972018-06-20 11:46:42 +0100940 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * out_stride_x + index2D / CONV_WIDTH * out_stride_y + z * out_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100941 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
942}
943
944#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000945
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100946#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000947#if defined(CONV_STRIDE_X)
948#if CONV_STRIDE_X == 1
949#define convolution1x3_f16 convolution1x3_stride_1_f16
950#elif CONV_STRIDE_X == 2
951#define convolution1x3_f16 convolution1x3_stride_2_f16
952#elif CONV_STRIDE_X == 3
953#define convolution1x3_f16 convolution1x3_stride_3_f16
954#else /* CONV_STRIDE_X */
955#error "Stride not supported"
956#endif /* CONV_STRIDE_X */
957
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100958#if(DILATION_X > 1 || DILATION_Y > 1)
959
960/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16
961 *
962 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
963 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
964 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
965 * @param[in] y_offset Offset from the source tensor from which to start convolution
966 * @param[in] weights_addr Pointer from where to get weights
967 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
968 */
969inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
970 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
971{
972 // Load the weights
973 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
974 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
975 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
976
977 half4 pixels0 = 0.0f;
978
979 half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
980 half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
981 half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
982
983 half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
984 half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
985 half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
986
987 half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
988 half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
989 half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
990
991 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
992 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
993 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
994
995 return pixels0;
996}
997
998/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
999 *
1000 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
1001 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1002 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1003 * @param[in] y_offset Offset from the source tensor from which to start convolution
1004 * @param[in] weights_addr Pointer from where to get weights
1005 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
1006 */
1007inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
1008 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
1009{
1010 // Load the weights
1011 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1012 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1013 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1014
1015 half4 pixels0 = 0.0f;
1016
1017 half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
1018 half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
1019 half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
1020
1021 half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
1022 half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
1023 half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
1024
1025 half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
1026 half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
1027 half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
1028
1029 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
1030 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
1031 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
1032
1033 return pixels0;
1034}
1035
1036#endif // (DILATION_X > 1 && DILATION_Y > 1)
1037
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001038/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
1039 *
1040 * @param[in] left_pixel Pointer to the left pixel.
1041 * @param[in] left_coeff Weight of the left pixel
1042 * @param[in] middle_coeff Weight of the middle pixel
1043 * @param[in] right_coeff Weight of the right pixel
1044 *
1045 * @return a half4 containing 4 convoluted values.
1046 */
1047inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
1048 const half left_coeff,
1049 const half middle_coeff,
1050 const half right_coeff)
1051{
Usama Arife73686a2019-04-08 17:30:48 +01001052#if(DILATION_X == 1 && DILATION_Y == 1)
1053
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001054 half8 temp = vload8(0, (__global half *)left_pixel);
1055
1056 half4 left = CONVERT(temp.s0123, half4);
1057 half4 middle = CONVERT(temp.s1234, half4);
1058 half4 right = CONVERT(temp.s2345, half4);
1059
1060 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001061#else /* DILATION_X==1 && DILATION_Y==1 */
1062 return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
1063 + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
1064 + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
1065
1066#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001067}
1068
1069/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
1070 *
1071 * @param[in] left_pixel Pointer to the left pixel.
1072 * @param[in] left_coeff Weight of the left pixel
1073 * @param[in] middle_coeff Weight of the middle pixel
1074 * @param[in] right_coeff Weight of the right pixel
1075 *
1076 * @return a half4 containing 4 convoluted values.
1077 */
1078inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
1079 const half left_coeff,
1080 const half middle_coeff,
1081 const half right_coeff)
1082{
Usama Arife73686a2019-04-08 17:30:48 +01001083#if(DILATION_X == 1 && DILATION_Y == 1)
1084
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001085 half8 temp0 = vload8(0, (__global half *)left_pixel);
1086 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
1087
1088 half4 left = CONVERT(temp0.s0246, half4);
1089 half4 middle = CONVERT(temp0.s1357, half4);
1090 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
1091
1092 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001093#else /* DILATION_X==1 && DILATION_Y==1 */
1094
1095 __global half *left_pixel_float = (__global half *)left_pixel;
1096
1097 return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
1098 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
1099 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
1100
1101#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001102}
1103
1104/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
1105 *
1106 * @param[in] left_pixel Pointer to the left pixel.
1107 * @param[in] left_coeff Weight of the left pixel
1108 * @param[in] middle_coeff Weight of the middle pixel
1109 * @param[in] right_coeff Weight of the right pixel
1110 *
1111 * @return a half4 containing 4 convoluted values.
1112 */
1113inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
1114 const half left_coeff,
1115 const half middle_coeff,
1116 const half right_coeff)
1117{
Usama Arife73686a2019-04-08 17:30:48 +01001118#if(DILATION_X == 1 && DILATION_Y == 1)
1119
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001120 half16 temp0 = vload16(0, (__global half *)left_pixel);
1121
1122 half4 left = CONVERT(temp0.s0369, half4);
1123 half4 middle = CONVERT(temp0.s147A, half4);
1124 half4 right = CONVERT(temp0.s258B, half4);
1125
1126 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001127#else /* DILATION_X==1 && DILATION_Y==1 */
1128
1129 __global half *left_pixel_float = (__global half *)left_pixel;
1130
1131 return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
1132 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
1133 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
1134
1135#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001136}
1137
1138/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
1139 *
1140 * Convolution matrix layout:
1141 *
1142 * [ mat0, mat1, mat2 ]\n
1143 * [ mat3, mat4, mat5 ]\n
1144 * [ mat6, mat7, mat8 ]\n
1145 *
1146 * @param[in] src A pointer to source Image structure
1147 * @param[in] mat0 Coefficient from the convolution matrix
1148 * @param[in] mat1 Coefficient from the convolution matrix
1149 * @param[in] mat2 Coefficient from the convolution matrix
1150 * @param[in] mat3 Coefficient from the convolution matrix
1151 * @param[in] mat4 Coefficient from the convolution matrix
1152 * @param[in] mat5 Coefficient from the convolution matrix
1153 * @param[in] mat6 Coefficient from the convolution matrix
1154 * @param[in] mat0 Coefficient from the convolution matrix
1155 * @param[in] mat7 Coefficient from the convolution matrix
1156 * @param[in] mat8 Coefficient from the convolution matrix
1157 *
1158 * @return a half4 containing 4 convoluted values.
1159 */
1160inline half4 convolution3x3_f16(
1161 Image *src,
1162 const half mat0, const half mat1, const half mat2,
1163 const half mat3, const half mat4, const half mat5,
1164 const half mat6, const half mat7, const half mat8)
1165{
1166 half4 pixels;
1167
1168 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
Usama Arife73686a2019-04-08 17:30:48 +01001169 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
1170 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001171
1172 return pixels;
1173}
1174
Giorgio Arena76572242018-04-04 17:44:26 +01001175#if defined(DEPTH_MULTIPLIER)
1176
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001177/** This OpenCL kernel computes the depthwise convolution 3x3
1178 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001179 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001180 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1181 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1182 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001183 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001184 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1185 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001186 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001187 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001188 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001189 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1190 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001191 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001192 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001193 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1194 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1195 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1196 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1197 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1198 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1199 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001200 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001201 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1202 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1203 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1204 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1205 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1206 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1207 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001208 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001209 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1210 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1211 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1212 */
1213__kernel void depthwise_convolution_3x3_f16(
1214 TENSOR3D_DECLARATION(src),
1215 TENSOR3D_DECLARATION(dst),
1216 TENSOR3D_DECLARATION(weights)
1217#if defined(HAS_BIAS)
1218 ,
1219 VECTOR_DECLARATION(biases)
1220#endif //defined(HAS_BIAS)
1221)
1222{
1223 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1224 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001225 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001226#if defined(HAS_BIAS)
1227 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1228#endif //defined(HAS_BIAS)
1229
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001230 // Extract channel and linearized batch indices
1231 const int channel = get_global_id(2) % DST_CHANNELS;
1232 const int batch = get_global_id(2) / DST_CHANNELS;
1233 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1234 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
1235 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +01001236
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001237 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001238 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
1239 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
1240 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001241
1242 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
1243 weights_values1.s0, weights_values1.s1, weights_values1.s2,
1244 weights_values2.s0, weights_values2.s1, weights_values2.s2);
1245#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001246 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001247#endif //defined(HAS_BIAS)
1248
Usama Arif6a98a6e2019-05-10 17:07:27 +01001249 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001250}
Giorgio Arena76572242018-04-04 17:44:26 +01001251#endif // defined(DEPTH_MULTIPLIER)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001252#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001253
1254/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
1255 * when both stride_x and stride_y are equal to 1
1256 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001257 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001258 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1259 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1260 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001261 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001262 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1263 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001264 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001265 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001266 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001267 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1268 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001269 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001270 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1271 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1272 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1273 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1274 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1275 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1276 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1277 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1278 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1279 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1280 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1281 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1282 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1283 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1284 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1285 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1286 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1287 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1288 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1289 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1290 */
1291__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
1292 TENSOR3D_DECLARATION(src),
1293 TENSOR3D_DECLARATION(dst),
1294 TENSOR3D_DECLARATION(weights)
1295#if defined(HAS_BIAS)
1296 ,
1297 VECTOR_DECLARATION(biases)
1298#endif //defined(HAS_BIAS)
1299)
1300{
1301 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1302 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001303 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1304
1305 // Extract channel and linearized batch indices
1306 const int channel = get_global_id(2) % DST_CHANNELS;
1307 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001308
1309#ifdef HAS_BIAS
1310 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1311
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001312 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001313#endif /* defined(HAS_BIAS) */
1314
1315 half4 pixels0 = 0.0f;
1316 half4 pixels1 = 0.0f;
1317 half4 pixels2 = 0.0f;
1318 half4 pixels3 = 0.0f;
1319
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001320 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1321 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1322 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001323
Usama Arife73686a2019-04-08 17:30:48 +01001324#if(DILATION_X == 1 && DILATION_Y == 1)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001325 // Load the weights
1326 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1327 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1328 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1329
1330 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
1331 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1332 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1333 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1334 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1335 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1336 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
1337
1338 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
1339 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
1340 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
1341 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
1342 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
1343 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
1344 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
1345 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
1346 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
1347 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
1348 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
1349 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
1350
Usama Arife73686a2019-04-08 17:30:48 +01001351#else /* DILATION_X==1 && DILATION_Y==1 */
1352
1353 //3x3 Convolution of elements starting in 0th row
1354 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1355 //3x3 Convolution of elements starting in 1st row
1356 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
1357 //3x3 Convolution of elements starting in 2nd row
1358 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1359 //3x3 Convolution of elements starting in 3rd row
1360 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
1361
1362#endif /* DILATION_X==1 && DILATION_Y==1 */
1363
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001364#ifdef HAS_BIAS
1365 pixels0 += (half4)bias;
1366 pixels1 += (half4)bias;
1367 pixels2 += (half4)bias;
1368 pixels3 += (half4)bias;
1369#endif /* defined(HAS_BIAS) */
1370
Usama Arif6a98a6e2019-05-10 17:07:27 +01001371 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1372 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1373 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
1374 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001375}
1376
1377/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
1378 * when both stride_x and stride_y are equal to 2
1379 *
Usama Arif6a98a6e2019-05-10 17:07:27 +01001380 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001381 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1382 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1383 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001384 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001385 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1386 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001387 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001388 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001389 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001390 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001391 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1392 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001393 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1394 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1395 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1396 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1397 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1398 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1399 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1400 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1401 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1402 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1403 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1404 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1405 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1406 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1407 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1408 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1409 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1410 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1411 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1412 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1413 */
1414__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
1415 TENSOR3D_DECLARATION(src),
1416 TENSOR3D_DECLARATION(dst),
1417 TENSOR3D_DECLARATION(weights)
1418#if defined(HAS_BIAS)
1419 ,
1420 VECTOR_DECLARATION(biases)
1421#endif //defined(HAS_BIAS)
1422)
1423{
1424 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1425 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001426 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1427
1428 // Extract channel and linearized batch indices
1429 const int channel = get_global_id(2) % DST_CHANNELS;
1430 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001431
1432#ifdef HAS_BIAS
1433 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1434
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001435 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001436#endif /* defined(HAS_BIAS) */
1437
1438 half4 pixels0 = 0.0f;
1439 half4 pixels1 = 0.0f;
1440
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001441 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1442 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1443 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001444
Usama Arife73686a2019-04-08 17:30:48 +01001445#if(DILATION_X == 1 && DILATION_Y == 1)
1446
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001447 // Load the weights
1448 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1449 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1450 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1451
1452 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
1453 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1454 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1455 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1456 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1457 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1458 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1459 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1460 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1461 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1462 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1463
1464 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1465 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1466 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1467 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1468 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1469 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
1470
Usama Arife73686a2019-04-08 17:30:48 +01001471#else /* DILATION_X==1 && DILATION_Y==1 */
1472 //3x3 Convolution of elements starting in 0th row
1473 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1474 //3x3 Convolution of elements starting in 2nd row
1475 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1476#endif /* DILATION_X==1 && DILATION_Y==1 */
1477
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001478#ifdef HAS_BIAS
1479 pixels0 += (half4)bias;
1480 pixels1 += (half4)bias;
1481#endif /* defined(HAS_BIAS) */
1482
Usama Arif6a98a6e2019-05-10 17:07:27 +01001483 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1484 vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001485}
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001486#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Giorgio Arenad051e972018-06-20 11:46:42 +01001487
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001488#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001489
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001490#if DATA_TYPE != float || DATA_TYPE != half
1491#error "Unsupported data type"
1492#endif // DATA_TYPE != float || DATA_TYPE != half
1493
1494#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001495
1496#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1497/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1498 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001499 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001500 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1501 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1502 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1503 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1504 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1505 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001506 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001507 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1508 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arenad051e972018-06-20 11:46:42 +01001509 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001510 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001511 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001512 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001513 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001514 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001515 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001516 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1517 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1518 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1519 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001520 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1521 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1522 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1523 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1524 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1525 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1526 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001527 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1528 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001529 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001530 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001531 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1532 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1533 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1534 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1535 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1536 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1537 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1538 * @param[in] max_offset Max offset for the input tensor
1539 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1540 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1541 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1542 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1543 */
1544__kernel void depthwise_convolution_3x3_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001545 TENSOR4D_DECLARATION(src),
1546 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001547 TENSOR3D_DECLARATION(weights),
1548#if defined(HAS_BIAS)
1549 VECTOR_DECLARATION(biases),
1550#endif /* defined(HAS_BIAS) */
1551 int max_offset)
1552{
1553 int x = get_global_id(0); // channels
1554 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001555#if defined(DST_DEPTH)
1556 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1557 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001558#else // defined(DST_DEPTH)
1559 int z = get_global_id(2); // spatial coordinate y
1560#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001561
1562 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1563
Georgios Pinitas37044642018-10-30 14:53:25 +00001564#if defined(DST_DEPTH)
1565 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1566#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001567 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001568#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001569
1570 int z_coord = 0;
1571 int4 offset = 0;
Usama Arife73686a2019-04-08 17:30:48 +01001572 int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001573
1574 // We compute 2x1x1 [C,W,H] elements
1575 VEC_FLOAT acc = 0;
1576
1577 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001578 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1579 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1580 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1581 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1582 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1583 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1584 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1585 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1586 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001587
1588 // Load input values
1589 // z == 0
1590 // Clamp z_coord as for z = 0, it can be negative
1591 // z_coord is casted to unsigned int in order to use just a min() operation
1592 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
1593 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1594 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1595 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001596 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001597
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001598 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1599 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1600 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001601
1602 // z == 1
1603 // z_coord can be only negative for z = 0 so we do not need to clamp it
1604 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Usama Arife73686a2019-04-08 17:30:48 +01001605 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001606 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001607 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1608 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1609 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001610
1611 // z == 2
Usama Arife73686a2019-04-08 17:30:48 +01001612 // Offset can be out-of-bound so we need to check if it is greater than max_offset
1613 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
1614 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001615 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001616 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1617 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1618 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001619
1620 acc = fma(values0, w0, acc);
1621 acc = fma(values1, w1, acc);
1622 acc = fma(values2, w2, acc);
1623
1624 acc = fma(values3, w3, acc);
1625 acc = fma(values4, w4, acc);
1626 acc = fma(values5, w5, acc);
1627
1628 acc = fma(values6, w6, acc);
1629 acc = fma(values7, w7, acc);
1630 acc = fma(values8, w8, acc);
1631
1632#if defined(HAS_BIAS)
1633 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001634 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001635 acc += bias_values;
1636#endif // defined(HAS_BIAS)
1637
Georgios Pinitas37044642018-10-30 14:53:25 +00001638#if defined(DST_DEPTH)
1639 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1640#else /* defined(DST_DEPTH) */
1641 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
1642#endif /* defined(DST_DEPTH) */
1643
Giorgio Arenad051e972018-06-20 11:46:42 +01001644 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001645 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr));
Giorgio Arenad051e972018-06-20 11:46:42 +01001646}
1647#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1648
1649#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1650/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1651 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001652 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001653 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1654 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1655 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1656 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1657 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1658 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001659 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001660 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1661 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
Giorgio Arenad051e972018-06-20 11:46:42 +01001662 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001663 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001664 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001665 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001666 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001667 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001668 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001669 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1670 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1671 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1672 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001673 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1674 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1675 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1676 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1677 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1678 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1679 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001680 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1681 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001682 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001683 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001684 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1685 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1686 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1687 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1688 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1689 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1690 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1691 * @param[in] max_offset Max offset for the input tensor
1692 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1693 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1694 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1695 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1696 */
1697__kernel void depthwise_convolution_3x3_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +00001698 TENSOR4D_DECLARATION(src),
1699 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001700 TENSOR3D_DECLARATION(weights),
1701#if defined(HAS_BIAS)
1702 VECTOR_DECLARATION(biases),
1703#endif /* defined(HAS_BIAS) */
1704 int max_offset)
1705{
1706 int x = get_global_id(0); // channels
1707 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001708#if defined(DST_DEPTH)
1709 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1710 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001711#else // defined(DST_DEPTH)
1712 int z = get_global_id(2); // spatial coordinate y
1713#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001714
1715 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1716
Georgios Pinitas37044642018-10-30 14:53:25 +00001717#if defined(DST_DEPTH)
1718 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1719#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001720 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001721#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001722
1723 int z_coord = 0;
1724 int4 offset = 0;
Georgios Pinitased32f432018-07-10 17:03:11 +01001725 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001726
1727 // We compute 2x2x2 [C,W,H] elements
1728 VEC_FLOAT acc0 = 0;
1729 VEC_FLOAT acc1 = 0;
1730 VEC_FLOAT acc2 = 0;
1731 VEC_FLOAT acc3 = 0;
1732
1733 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001734 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1735 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1736 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1737 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1738 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1739 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1740 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1741 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1742 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001743
1744 // Load input values
1745 // z == 0
1746 // Clamp z_coord as for z = 0, it can be negative
1747 // z_coord is casted to unsigned int in order to use just a min() operation
1748 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitased32f432018-07-10 17:03:11 +01001749 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenad051e972018-06-20 11:46:42 +01001750 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1751 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001752 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001753
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001754 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1755 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1756 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1757 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001758
1759 // z == 1
1760 // z_coord can be only negative for z = 0 so we do not need to clamp it
1761 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001762 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenad051e972018-06-20 11:46:42 +01001763 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001764 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1765 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1766 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1767 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001768
1769 // z == 2
1770 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1771 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1772 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001773 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001774 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1775 VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1776 VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1777 VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001778
1779 // z == 3
1780 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1781 // However offset can be out-of-bound so we need to check if it is greater than max_offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001782 offset += (int4)src_stride_z;
1783 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001784 VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1785 VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1786 VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1787 VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001788
1789 acc0 = fma(values0, w0, acc0);
1790 acc0 = fma(values1, w1, acc0);
1791 acc0 = fma(values2, w2, acc0);
1792 acc1 = fma(values1, w0, acc1);
1793 acc1 = fma(values2, w1, acc1);
1794 acc1 = fma(values3, w2, acc1);
1795
1796 acc0 = fma(values4, w3, acc0);
1797 acc0 = fma(values5, w4, acc0);
1798 acc0 = fma(values6, w5, acc0);
1799 acc1 = fma(values5, w3, acc1);
1800 acc1 = fma(values6, w4, acc1);
1801 acc1 = fma(values7, w5, acc1);
1802
1803 acc0 = fma(values8, w6, acc0);
1804 acc0 = fma(values9, w7, acc0);
1805 acc0 = fma(values10, w8, acc0);
1806 acc1 = fma(values9, w6, acc1);
1807 acc1 = fma(values10, w7, acc1);
1808 acc1 = fma(values11, w8, acc1);
1809
1810 acc2 = fma(values4, w0, acc2);
1811 acc2 = fma(values5, w1, acc2);
1812 acc2 = fma(values6, w2, acc2);
1813 acc3 = fma(values5, w0, acc3);
1814 acc3 = fma(values6, w1, acc3);
1815 acc3 = fma(values7, w2, acc3);
1816
1817 acc2 = fma(values8, w3, acc2);
1818 acc2 = fma(values9, w4, acc2);
1819 acc2 = fma(values10, w5, acc2);
1820 acc3 = fma(values9, w3, acc3);
1821 acc3 = fma(values10, w4, acc3);
1822 acc3 = fma(values11, w5, acc3);
1823
1824 acc2 = fma(values12, w6, acc2);
1825 acc2 = fma(values13, w7, acc2);
1826 acc2 = fma(values14, w8, acc2);
1827 acc3 = fma(values13, w6, acc3);
1828 acc3 = fma(values14, w7, acc3);
1829 acc3 = fma(values15, w8, acc3);
1830
1831#if defined(HAS_BIAS)
1832 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1833
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001834 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001835
1836 acc0 += bias_values;
1837 acc1 += bias_values;
1838 acc2 += bias_values;
1839 acc3 += bias_values;
1840#endif // defined(HAS_BIAS)
1841
Georgios Pinitas37044642018-10-30 14:53:25 +00001842#if defined(DST_DEPTH)
1843 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1844#else /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001845 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001846#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001847
1848 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001849 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc0, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001850 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001851 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc1, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001852
1853#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1854 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1855#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1856 {
1857 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001858 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc2, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001859 VSTORE(VEC_SIZE)
Usama Arif6a98a6e2019-05-10 17:07:27 +01001860 (ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, acc3, A_VAL, B_VAL), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001861 }
1862}
1863
1864#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
giuros016d109962019-01-07 17:47:19 +00001865#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)