blob: a8611af98ee58d4a435d281efa414c42715cd66c [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
giuros016d109962019-01-07 17:47:19 +00002 * Copyright (c) 2017-2019 ARM Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26
Manuel Bottinia788c2f2019-04-08 13:18:00 +010027#if defined(FUSED_ACTIVATION)
28#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
29#define SELECT_TYPE VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
30#include "activation_helpers.h"
31#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
32#else /* defined(FUSED_ACTIVATION) */
33#define ACTIVATION_FUNC(x) (x)
34#endif /* defined(FUSED_ACTIVATION) */
35
36/** Get the pointer position at a certain offset in x and y direction.
37 *
38 * @param[in] ptr Pointer to the starting position of the buffer
39 * @param[in] x Relative X position
40 * @param[in] y Relative Y position
41 * @param[in] stride_x Stride of the source tensor in X dimension (in bytes)
42 * @param[in] stride_y Stride of the source tensor in Y dimension (in bytes)
43 *
44 * @return a uchar
45 */
46inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
47{
48 return ptr + x * stride_x + y * stride_y;
49}
50
51#if(DILATION_X == 1 && DILATION_Y == 1)
52
53#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
54 ({ \
55 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
56 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
57 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
58 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
59 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
60 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
61 })
62
63#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
64 ({ \
65 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
66 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
67 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
68 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
69 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
70 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
71 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
72 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
73 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
74 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
75 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
76 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
77 })
78
79#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
80 ({ \
81 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
82 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
83 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
84 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
85 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
86 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
87 })
88
89#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
90 ({ \
91 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
92 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
93 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
94 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
95 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
96 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
97 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
98 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
99 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
100 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
101 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
102 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
103 })
104
105#else /* DILATION_X==1 && DILATION_Y==1 */
106
107#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
108 ({ \
109 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
110 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
111 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
112 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
113 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
114 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
115 })
116
117#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
118 ({ \
119 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
120 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
121 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
122 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
123 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
124 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
125 })
126
127#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
128 ({ \
129 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
130 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
131 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
132 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
133 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
134 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
135 acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
136 acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
137 acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
138 acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
139 acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
140 acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
141 })
142
143#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
144 ({ \
145 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
146 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
147 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
148 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
149 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
150 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
151 acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
152 acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
153 acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
154 acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
155 acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
156 acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
157 })
158
159#endif /* DILATION_X==1 && DILATION_Y==1 */
160
161#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100162#if defined(CONV_STRIDE_X)
163
Giorgio Arena93a690e2017-08-01 16:09:33 +0100164#if CONV_STRIDE_X == 1
165#define convolution1x3 convolution1x3_stride_1
166#elif CONV_STRIDE_X == 2
167#define convolution1x3 convolution1x3_stride_2
168#elif CONV_STRIDE_X == 3
169#define convolution1x3 convolution1x3_stride_3
170#else /* CONV_STRIDE_X */
171#error "Stride not supported"
172#endif /* CONV_STRIDE_X */
173
174/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
175 *
176 * @param[in] left_pixel Pointer to the left pixel.
177 * @param[in] left_coeff Weight of the left pixel
178 * @param[in] middle_coeff Weight of the middle pixel
179 * @param[in] right_coeff Weight of the right pixel
180 *
181 * @return a float2 containing 2 convoluted values.
182 */
183inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
184 const float left_coeff,
185 const float middle_coeff,
186 const float right_coeff)
187{
Usama Arife73686a2019-04-08 17:30:48 +0100188#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100189 float4 temp = vload4(0, (__global float *)left_pixel);
190
191 float2 left = CONVERT(temp.s01, float2);
192 float2 middle = CONVERT(temp.s12, float2);
193 float2 right = CONVERT(temp.s23, float2);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100194 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100195#else /* DILATION_X==1 && DILATION_Y==1 */
196 return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
197 + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
198 + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
199#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100200}
201
202/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
203 *
204 * @param[in] left_pixel Pointer to the left pixel.
205 * @param[in] left_coeff Weight of the left pixel
206 * @param[in] middle_coeff Weight of the middle pixel
207 * @param[in] right_coeff Weight of the right pixel
208 *
209 * @return a float2 containing 2 convoluted values.
210 */
211inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
212 const float left_coeff,
213 const float middle_coeff,
214 const float right_coeff)
215{
Usama Arife73686a2019-04-08 17:30:48 +0100216#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100217 float4 temp0 = vload4(0, (__global float *)left_pixel);
218 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
219
220 float2 left = CONVERT(temp0.s02, float2);
221 float2 middle = CONVERT(temp0.s13, float2);
222 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
223
224 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100225#else /* DILATION_X==1 && DILATION_Y==1 */
226 __global float *left_pixel_float = (__global float *)left_pixel;
227
228 return vload4(0, left_pixel_float).s02 * (float2)left_coeff
229 + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
230 + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
231
232#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100233}
234
235/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
236 *
237 * @param[in] left_pixel Pointer to the left pixel.
238 * @param[in] left_coeff Weight of the left pixel
239 * @param[in] middle_coeff Weight of the middle pixel
240 * @param[in] right_coeff Weight of the right pixel
241 *
242 * @return a float2 containing 2 convoluted values.
243 */
244inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
245 const float left_coeff,
246 const float middle_coeff,
247 const float right_coeff)
248{
Usama Arife73686a2019-04-08 17:30:48 +0100249#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100250 float4 temp0 = vload4(0, (__global float *)left_pixel);
251 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
252
253 float2 left = CONVERT(temp0.s03, float2);
254 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
255 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
256
257 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100258#else /* DILATION_X==1 && DILATION_Y==1 */
259 __global float *left_pixel_float = (__global float *)left_pixel;
260
261 return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
262 + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
263 + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
264#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100265}
266
267/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
268 *
269 * Convolution matrix layout:
270 *
271 * [ mat0, mat1, mat2 ]\n
272 * [ mat3, mat4, mat5 ]\n
273 * [ mat6, mat7, mat8 ]\n
274 *
275 * @param[in] src A pointer to source Image structure
276 * @param[in] mat0 Coefficient from the convolution matrix
277 * @param[in] mat1 Coefficient from the convolution matrix
278 * @param[in] mat2 Coefficient from the convolution matrix
279 * @param[in] mat3 Coefficient from the convolution matrix
280 * @param[in] mat4 Coefficient from the convolution matrix
281 * @param[in] mat5 Coefficient from the convolution matrix
282 * @param[in] mat6 Coefficient from the convolution matrix
283 * @param[in] mat0 Coefficient from the convolution matrix
284 * @param[in] mat7 Coefficient from the convolution matrix
285 * @param[in] mat8 Coefficient from the convolution matrix
286 *
287 * @return a float2 containing 2 convoluted values.
288 */
289inline float2 convolution3x3(
290 Image *src,
291 const float mat0, const float mat1, const float mat2,
292 const float mat3, const float mat4, const float mat5,
293 const float mat6, const float mat7, const float mat8)
294{
295 float2 pixels;
296
297 pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
Usama Arife73686a2019-04-08 17:30:48 +0100298 pixels += convolution1x3(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
299 pixels += convolution1x3(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100300
301 return pixels;
302}
303
Gian Marcoc799ed82018-02-01 16:57:48 +0000304/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000305 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000306 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
307 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000308 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000309 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000310 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000311 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000312 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
313 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000314 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000315 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
316 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
317 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
318 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
319 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
320 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
321 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000322 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000323 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
324 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
325 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
326 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
327 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
328 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
329 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
330 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
331 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
332 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
333 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
334 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100335__kernel void depthwise_convolution_3x3(
336 TENSOR3D_DECLARATION(src),
337 TENSOR3D_DECLARATION(dst),
338 TENSOR3D_DECLARATION(weights)
339#if defined(HAS_BIAS)
340 ,
341 VECTOR_DECLARATION(biases)
342#endif //defined(HAS_BIAS)
343)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100344{
345 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
346 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100347 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100348#if defined(HAS_BIAS)
349 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
350#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100351
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100352 // Extract channel and linearized batch indices
353 const int channel = get_global_id(2) % DST_CHANNELS;
354 const int batch = get_global_id(2) / DST_CHANNELS;
355 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
356 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
357 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100358
Giorgio Arena93a690e2017-08-01 16:09:33 +0100359 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100360 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + offset.s0));
361 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + offset.s1));
362 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + offset.s2));
Giorgio Arena93a690e2017-08-01 16:09:33 +0100363
364 float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
365 weights_values1.s0, weights_values1.s1, weights_values1.s2,
366 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100367#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100368 pixels += (float2)(*((__global float *)(biases.ptr + channel * biases_stride_x)));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100369#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100370
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100371 vstore2(ACTIVATION_FUNC(pixels), 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100372}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100373#endif //defined(CONV_STRIDE_X)
374
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100375#if(DILATION_X > 1 || DILATION_Y > 1)
Usama Arife73686a2019-04-08 17:30:48 +0100376
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100377/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100378 *
379 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
380 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
381 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
382 * @param[in] y_offset Offset from the source tensor from which to start convolution
383 * @param[in] weights_addr Pointer from where to get weights
384 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
385 */
386inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
387 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
388{
389 // Load the weights
390 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
391 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
392 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
393
394 float2 pixels0 = 0.0f;
395
396 float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
397 float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
398 float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
399
400 float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
401 float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
402 float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
403
404 float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
405 float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
406 float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
407
408 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
409 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
410 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
411
412 return pixels0;
413}
414
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100415/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32
Usama Arife73686a2019-04-08 17:30:48 +0100416 *
417 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
418 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
419 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
420 * @param[in] y_offset Offset from the source tensor from which to start convolution
421 * @param[in] weights_addr Pointer from where to get weights
422 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
423 */
424inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
425 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
426{
427 // Load the weights
428 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
429 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
430 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
431
432 float2 pixels0 = 0.0f;
433
434 float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
435 float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
436 float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
437
438 float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
439 float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
440 float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
441
442 float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
443 float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
444 float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
445
446 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
447 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
448 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
449
450 return pixels0;
451}
452
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100453#endif /* (DILATION_X > 1 || DILATION_Y > 1) */
Usama Arife73686a2019-04-08 17:30:48 +0100454
Gian Marcoc799ed82018-02-01 16:57:48 +0000455/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
456 * stride_x and stride_y are equal to 1
457 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100458 * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
459 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
460 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
461 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
462 * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=float
463 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000464 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
465 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000466 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000467 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000468 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000469 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000470 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
471 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
472 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
473 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
474 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
475 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
476 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
477 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
478 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
479 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
480 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
481 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
482 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
483 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
484 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
485 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
486 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
487 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
488 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
489 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
490 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
491 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
492 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000493__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000494 TENSOR3D_DECLARATION(src),
495 TENSOR3D_DECLARATION(dst),
496 TENSOR3D_DECLARATION(weights)
497#if defined(HAS_BIAS)
498 ,
499 VECTOR_DECLARATION(biases)
500#endif //defined(HAS_BIAS)
501)
502{
503 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
504 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100505 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000506
507 float2 pixels0 = 0.0f;
508 float2 pixels1 = 0.0f;
509 float2 pixels2 = 0.0f;
510 float2 pixels3 = 0.0f;
511
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100512 // Extract channel and linearized batch indices
513 const int channel = get_global_id(2) % DST_CHANNELS;
514 const int batch = get_global_id(2) / DST_CHANNELS;
515 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
516 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
517 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000518
Usama Arife73686a2019-04-08 17:30:48 +0100519#if(DILATION_X == 1 && DILATION_Y == 1)
Gian Marcoc799ed82018-02-01 16:57:48 +0000520 // Load the weights
521 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
522 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
523 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
524
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000525 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000526 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
527 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
528 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
529 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000530 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
531 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000532
533 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
534 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
535 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
536 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
537 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
538 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
539 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
540 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
541 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
542 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
543 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
544 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
545
Usama Arife73686a2019-04-08 17:30:48 +0100546#else /* DILATION_X==1 && DILATION_Y==1 */
547
548 //3x3 Convolution of elements starting in 0th row
549 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
550 //3x3 Convolution of elements starting in 1st row
551 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
552 //3x3 Convolution of elements starting in 2nd row
553 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
554 //3x3 Convolution of elements starting in 3rd row
555 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
556
557#endif /* DILATION_X==1 && DILATION_Y==1 */
558
Gian Marcoc799ed82018-02-01 16:57:48 +0000559#ifdef HAS_BIAS
560 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
561
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100562 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000563
564 pixels0 += (float2)bias;
565 pixels1 += (float2)bias;
566 pixels2 += (float2)bias;
567 pixels3 += (float2)bias;
568#endif /* defined(HAS_BIAS) */
569
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100570 vstore2(ACTIVATION_FUNC(pixels0), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
571 vstore2(ACTIVATION_FUNC(pixels1), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
572 vstore2(ACTIVATION_FUNC(pixels2), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
573 vstore2(ACTIVATION_FUNC(pixels3), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000574}
575
576/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
577 * stride_x and stride_y are equal to 2
578 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100579 * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
580 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.
581 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
582 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
583 * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=float
584 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000585 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
586 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000587 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000588 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000589 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000590 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000591 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
592 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
593 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
594 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
595 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
596 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
597 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
598 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
599 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
600 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
601 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
602 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
603 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
604 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
605 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
606 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
607 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
608 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
609 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
610 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
611 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
612 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
613 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000614__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000615 TENSOR3D_DECLARATION(src),
616 TENSOR3D_DECLARATION(dst),
617 TENSOR3D_DECLARATION(weights)
618#if defined(HAS_BIAS)
619 ,
620 VECTOR_DECLARATION(biases)
621#endif //defined(HAS_BIAS)
622)
623{
624 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
625 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100626 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000627
628 float2 pixels0 = 0.0f;
629 float2 pixels1 = 0.0f;
630
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100631 // Extract channel and linearized batch indices
Georgios Pinitas37044642018-10-30 14:53:25 +0000632 const int channel = get_global_id(2) % DST_CHANNELS;
633 const int batch = get_global_id(2) / DST_CHANNELS;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100634 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
635 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
636 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000637
Usama Arife73686a2019-04-08 17:30:48 +0100638#if(DILATION_X == 1 && DILATION_Y == 1)
639
Gian Marcoc799ed82018-02-01 16:57:48 +0000640 // Load the weights
641 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
642 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
643 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
644
645 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
646 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
647 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
648 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
649 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
650 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
651 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
652 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
653 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
654 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
655 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
656
657 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
658 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
659 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
660 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
661 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
662 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
663
Usama Arife73686a2019-04-08 17:30:48 +0100664#else /* DILATION_X==1 && DILATION_Y==1 */
665
666 //3x3 Convolution of elements starting in 0th row
667 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
668 //3x3 Convolution of elements starting in 2nd row
669 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
670#endif /* DILATION_X==1 && DILATION_Y==1 */
671
Gian Marcoc799ed82018-02-01 16:57:48 +0000672#ifdef HAS_BIAS
673 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
674
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100675 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000676
677 pixels0 += (float2)bias;
678 pixels1 += (float2)bias;
679#endif /* defined(HAS_BIAS) */
680
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100681 vstore2(ACTIVATION_FUNC(pixels0), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
682 vstore2(ACTIVATION_FUNC(pixels1), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
Gian Marcoc799ed82018-02-01 16:57:48 +0000683}
684
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100685#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
Giorgio Arena76572242018-04-04 17:44:26 +0100686
giuros016d109962019-01-07 17:47:19 +0000687#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
688/** Reshape the weights for quantized depthwise convolution
689 *
690 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
691 * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
692 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
693 * @attention Input's height and width should be 3
694 *
695 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
696 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
697 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
698 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
699 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
700 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
701 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
702 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
703 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
704 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
705 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
706 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
707 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
708 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
709 */
710__kernel void depthwise_convolution_reshape_weights(
711 TENSOR3D_DECLARATION(src),
712 IMAGE_DECLARATION(dst))
713{
714 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
715 const int x = get_global_id(0);
716
717 // Load 3x3xVEC_SIZE weights
718 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
719 w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
720 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
721 w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
722 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
723 w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
724 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
725 w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
726 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
727 w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
728 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
729 w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
730 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
731 w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
732 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
733 w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
734 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
735 w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
736
737 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
738
739#if defined(TRANSPOSE)
740#if VEC_SIZE != 4
741#error "VEC_SIZE not supported"
742#else // VEC_SIZE != 4
743 VSTORE(VEC_SIZE)
744 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
745 VSTORE(VEC_SIZE)
746 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
747 VSTORE(VEC_SIZE)
748 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
749 VSTORE(VEC_SIZE)
750 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
751 VSTORE(VEC_SIZE)
752 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
753 VSTORE(VEC_SIZE)
754 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
755 VSTORE(VEC_SIZE)
756 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
757 VSTORE(VEC_SIZE)
758 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
759 VSTORE(VEC_SIZE)
760 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
761#endif // VEC_SIZE != 4
762#else // !defined(TRANSPOSE)
763 VSTORE(VEC_SIZE)
764 (w0, 0, dst_addr + 0);
765 VSTORE(VEC_SIZE)
766 (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
767 VSTORE(VEC_SIZE)
768 (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
769 VSTORE(VEC_SIZE)
770 (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
771 VSTORE(VEC_SIZE)
772 (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
773 VSTORE(VEC_SIZE)
774 (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
775 VSTORE(VEC_SIZE)
776 (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
777 VSTORE(VEC_SIZE)
778 (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
779 VSTORE(VEC_SIZE)
780 (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
781#endif // defined(TRANSPOSE)
782}
783#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
784
Giorgio Arenad051e972018-06-20 11:46:42 +0100785#if defined(NCHW)
786#define in_stride_x src_stride_x
787#define in_stride_y src_stride_y
788#define in_stride_z src_stride_z
789#define out_stride_x dst_stride_x
790#define out_stride_y dst_stride_y
791#define out_stride_z dst_stride_z
792#else //defined(NCHW)
793#define in_stride_x src_stride_y
794#define in_stride_y src_stride_z
795#define in_stride_z src_stride_x
796#define out_stride_x dst_stride_y
797#define out_stride_y dst_stride_z
798#define out_stride_z dst_stride_x
799#endif //defined(NCHW)
800
Giorgio Arena9fe41442017-08-23 16:36:24 +0100801#if defined(SRC_WIDTH) && defined(DATA_TYPE)
802/** This kernel reshapes each of the tensor's low three dimensions to single rows.
803 *
804 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
805 *
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100806 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
807 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
808 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
809 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
810 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
811 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
812 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
813 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
814 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
815 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
816 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
817 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
818 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
819 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
820 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
821 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
822 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
823 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Giorgio Arena9fe41442017-08-23 16:36:24 +0100824 */
giuros016d109962019-01-07 17:47:19 +0000825__kernel void depthwise_convolution_reshape_weights_generic(
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100826 TENSOR3D_DECLARATION(src),
827 IMAGE_DECLARATION(dst)
828#ifdef HAS_BIAS
829 ,
830 VECTOR_DECLARATION(biases)
831#endif /* HAS_BIAS */
832)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100833{
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100834#ifdef HAS_BIAS
835 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
836#endif /* HAS_BIAS */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100837
Giorgio Arenad051e972018-06-20 11:46:42 +0100838 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
839 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100840
Giorgio Arenad051e972018-06-20 11:46:42 +0100841 for(int i = 0; i < SRC_WIDTH; ++i, input_ptr += in_stride_x)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100842 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100843 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *((__global DATA_TYPE *)input_ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100844 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100845
846#if defined(HAS_BIAS)
847 if(get_global_id(1) == 0)
848 {
Michele Di Giorgiod24af8a2018-05-08 17:23:52 +0100849 *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100850 }
851#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100852}
853#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
854
Usama Arife73686a2019-04-08 17:30:48 +0100855#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER) && defined(DILATION_X) && defined(DILATION_Y)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100856/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
857 *
858 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Giorgio Arena76572242018-04-04 17:44:26 +0100859 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
Usama Arife73686a2019-04-08 17:30:48 +0100860 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
Giorgio Arena9fe41442017-08-23 16:36:24 +0100861 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100862 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100863 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
864 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
865 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
866 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
867 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
868 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
869 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
870 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
871 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
872 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
873 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
874 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
875 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
876 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
877 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
878 */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100879__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
880{
881 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
882
883 const int src_pixel_linear = get_global_id(1) * STRIDE_X;
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100884 const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
Usama Arife73686a2019-04-08 17:30:48 +0100885 const int max_initial_x = STRIDE_X * (((full_length - (KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1))) / STRIDE_X) + 1);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100886
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100887 const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
888 const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
Giorgio Arena76572242018-04-04 17:44:26 +0100889 const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100890
Giorgio Arenad051e972018-06-20 11:46:42 +0100891 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100892 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
893
Usama Arife73686a2019-04-08 17:30:48 +0100894 for(int y = src_y; y < src_y + KERNEL_HEIGHT + (KERNEL_HEIGHT - 1) * (DILATION_Y - 1); y += DILATION_Y)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100895 {
Usama Arife73686a2019-04-08 17:30:48 +0100896 for(int x = src_x; x < src_x + KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1); x += DILATION_X, ++output_ptr)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100897 {
898 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
899 {
Georgios Pinitasde5a1cc2018-02-02 12:52:07 +0000900 *output_ptr = PAD_VALUE;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100901 }
902 else
903 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100904 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * in_stride_x + y * in_stride_y));
Giorgio Arena9fe41442017-08-23 16:36:24 +0100905 }
906 }
907 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100908#if defined(HAS_BIAS)
909 *output_ptr = (DATA_TYPE)(1);
910#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100911}
912
Giorgio Arena76572242018-04-04 17:44:26 +0100913#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100914
915#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
916
917/** This kernel performs a reshaping of the output of the depthwise generic convolution.
918 *
919 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
920 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
921 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100922 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100923 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
924 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
925 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
926 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
927 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
928 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
929 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
930 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
931 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
932 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
933 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
934 */
935__kernel void depthwise_vector_to_tensor(
936 VECTOR_DECLARATION(src),
937 TENSOR3D_DECLARATION(dst))
938{
939 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
940
941 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
942 const int id0 = get_global_id(0);
943 const int z = id0 / patch_size;
944 const int index2D = id0 - z * patch_size;
945
Giorgio Arenad051e972018-06-20 11:46:42 +0100946 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * out_stride_x + index2D / CONV_WIDTH * out_stride_y + z * out_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100947 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
948}
949
950#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000951
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100952#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000953#if defined(CONV_STRIDE_X)
954#if CONV_STRIDE_X == 1
955#define convolution1x3_f16 convolution1x3_stride_1_f16
956#elif CONV_STRIDE_X == 2
957#define convolution1x3_f16 convolution1x3_stride_2_f16
958#elif CONV_STRIDE_X == 3
959#define convolution1x3_f16 convolution1x3_stride_3_f16
960#else /* CONV_STRIDE_X */
961#error "Stride not supported"
962#endif /* CONV_STRIDE_X */
963
Manuel Bottinia788c2f2019-04-08 13:18:00 +0100964#if(DILATION_X > 1 || DILATION_Y > 1)
965
966/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16
967 *
968 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
969 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
970 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
971 * @param[in] y_offset Offset from the source tensor from which to start convolution
972 * @param[in] weights_addr Pointer from where to get weights
973 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
974 */
975inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
976 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
977{
978 // Load the weights
979 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
980 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
981 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
982
983 half4 pixels0 = 0.0f;
984
985 half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
986 half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
987 half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
988
989 half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
990 half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
991 half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
992
993 half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
994 half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
995 half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
996
997 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
998 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
999 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
1000
1001 return pixels0;
1002}
1003
1004/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16
1005 *
1006 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
1007 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1008 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1009 * @param[in] y_offset Offset from the source tensor from which to start convolution
1010 * @param[in] weights_addr Pointer from where to get weights
1011 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
1012 */
1013inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
1014 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
1015{
1016 // Load the weights
1017 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1018 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1019 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1020
1021 half4 pixels0 = 0.0f;
1022
1023 half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
1024 half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
1025 half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
1026
1027 half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
1028 half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
1029 half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
1030
1031 half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
1032 half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
1033 half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
1034
1035 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
1036 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
1037 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
1038
1039 return pixels0;
1040}
1041
1042#endif // (DILATION_X > 1 && DILATION_Y > 1)
1043
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001044/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
1045 *
1046 * @param[in] left_pixel Pointer to the left pixel.
1047 * @param[in] left_coeff Weight of the left pixel
1048 * @param[in] middle_coeff Weight of the middle pixel
1049 * @param[in] right_coeff Weight of the right pixel
1050 *
1051 * @return a half4 containing 4 convoluted values.
1052 */
1053inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
1054 const half left_coeff,
1055 const half middle_coeff,
1056 const half right_coeff)
1057{
Usama Arife73686a2019-04-08 17:30:48 +01001058#if(DILATION_X == 1 && DILATION_Y == 1)
1059
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001060 half8 temp = vload8(0, (__global half *)left_pixel);
1061
1062 half4 left = CONVERT(temp.s0123, half4);
1063 half4 middle = CONVERT(temp.s1234, half4);
1064 half4 right = CONVERT(temp.s2345, half4);
1065
1066 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001067#else /* DILATION_X==1 && DILATION_Y==1 */
1068 return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
1069 + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
1070 + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
1071
1072#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001073}
1074
1075/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
1076 *
1077 * @param[in] left_pixel Pointer to the left pixel.
1078 * @param[in] left_coeff Weight of the left pixel
1079 * @param[in] middle_coeff Weight of the middle pixel
1080 * @param[in] right_coeff Weight of the right pixel
1081 *
1082 * @return a half4 containing 4 convoluted values.
1083 */
1084inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
1085 const half left_coeff,
1086 const half middle_coeff,
1087 const half right_coeff)
1088{
Usama Arife73686a2019-04-08 17:30:48 +01001089#if(DILATION_X == 1 && DILATION_Y == 1)
1090
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001091 half8 temp0 = vload8(0, (__global half *)left_pixel);
1092 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
1093
1094 half4 left = CONVERT(temp0.s0246, half4);
1095 half4 middle = CONVERT(temp0.s1357, half4);
1096 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
1097
1098 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001099#else /* DILATION_X==1 && DILATION_Y==1 */
1100
1101 __global half *left_pixel_float = (__global half *)left_pixel;
1102
1103 return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
1104 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
1105 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
1106
1107#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001108}
1109
1110/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
1111 *
1112 * @param[in] left_pixel Pointer to the left pixel.
1113 * @param[in] left_coeff Weight of the left pixel
1114 * @param[in] middle_coeff Weight of the middle pixel
1115 * @param[in] right_coeff Weight of the right pixel
1116 *
1117 * @return a half4 containing 4 convoluted values.
1118 */
1119inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
1120 const half left_coeff,
1121 const half middle_coeff,
1122 const half right_coeff)
1123{
Usama Arife73686a2019-04-08 17:30:48 +01001124#if(DILATION_X == 1 && DILATION_Y == 1)
1125
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001126 half16 temp0 = vload16(0, (__global half *)left_pixel);
1127
1128 half4 left = CONVERT(temp0.s0369, half4);
1129 half4 middle = CONVERT(temp0.s147A, half4);
1130 half4 right = CONVERT(temp0.s258B, half4);
1131
1132 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001133#else /* DILATION_X==1 && DILATION_Y==1 */
1134
1135 __global half *left_pixel_float = (__global half *)left_pixel;
1136
1137 return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
1138 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
1139 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
1140
1141#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001142}
1143
1144/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
1145 *
1146 * Convolution matrix layout:
1147 *
1148 * [ mat0, mat1, mat2 ]\n
1149 * [ mat3, mat4, mat5 ]\n
1150 * [ mat6, mat7, mat8 ]\n
1151 *
1152 * @param[in] src A pointer to source Image structure
1153 * @param[in] mat0 Coefficient from the convolution matrix
1154 * @param[in] mat1 Coefficient from the convolution matrix
1155 * @param[in] mat2 Coefficient from the convolution matrix
1156 * @param[in] mat3 Coefficient from the convolution matrix
1157 * @param[in] mat4 Coefficient from the convolution matrix
1158 * @param[in] mat5 Coefficient from the convolution matrix
1159 * @param[in] mat6 Coefficient from the convolution matrix
1160 * @param[in] mat0 Coefficient from the convolution matrix
1161 * @param[in] mat7 Coefficient from the convolution matrix
1162 * @param[in] mat8 Coefficient from the convolution matrix
1163 *
1164 * @return a half4 containing 4 convoluted values.
1165 */
1166inline half4 convolution3x3_f16(
1167 Image *src,
1168 const half mat0, const half mat1, const half mat2,
1169 const half mat3, const half mat4, const half mat5,
1170 const half mat6, const half mat7, const half mat8)
1171{
1172 half4 pixels;
1173
1174 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
Usama Arife73686a2019-04-08 17:30:48 +01001175 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
1176 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001177
1178 return pixels;
1179}
1180
Giorgio Arena76572242018-04-04 17:44:26 +01001181#if defined(DEPTH_MULTIPLIER)
1182
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001183/** This OpenCL kernel computes the depthwise convolution 3x3
1184 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001185 * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
1186 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1187 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1188 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
1189 * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=half
1190 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001191 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1192 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001193 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001194 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001195 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001196 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1197 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001198 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001199 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001200 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1201 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1202 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1203 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1204 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1205 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1206 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001207 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001208 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1209 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1210 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1211 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1212 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1213 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1214 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001215 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001216 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1217 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1218 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1219 */
1220__kernel void depthwise_convolution_3x3_f16(
1221 TENSOR3D_DECLARATION(src),
1222 TENSOR3D_DECLARATION(dst),
1223 TENSOR3D_DECLARATION(weights)
1224#if defined(HAS_BIAS)
1225 ,
1226 VECTOR_DECLARATION(biases)
1227#endif //defined(HAS_BIAS)
1228)
1229{
1230 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1231 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001232 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001233#if defined(HAS_BIAS)
1234 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1235#endif //defined(HAS_BIAS)
1236
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001237 // Extract channel and linearized batch indices
1238 const int channel = get_global_id(2) % DST_CHANNELS;
1239 const int batch = get_global_id(2) / DST_CHANNELS;
1240 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1241 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
1242 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +01001243
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001244 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001245 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
1246 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
1247 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001248
1249 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
1250 weights_values1.s0, weights_values1.s1, weights_values1.s2,
1251 weights_values2.s0, weights_values2.s1, weights_values2.s2);
1252#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001253 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001254#endif //defined(HAS_BIAS)
1255
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001256 vstore4(ACTIVATION_FUNC(pixels), 0, (__global half *)dst.ptr);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001257}
Giorgio Arena76572242018-04-04 17:44:26 +01001258#endif // defined(DEPTH_MULTIPLIER)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001259#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001260
1261/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
1262 * when both stride_x and stride_y are equal to 1
1263 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001264 * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
1265 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1266 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1267 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
1268 * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=half
1269 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001270 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1271 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001272 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001273 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001274 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001275 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1276 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001277 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001278 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1279 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1280 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1281 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1282 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1283 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1284 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1285 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1286 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1287 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1288 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1289 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1290 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1291 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1292 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1293 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1294 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1295 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1296 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1297 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1298 */
1299__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
1300 TENSOR3D_DECLARATION(src),
1301 TENSOR3D_DECLARATION(dst),
1302 TENSOR3D_DECLARATION(weights)
1303#if defined(HAS_BIAS)
1304 ,
1305 VECTOR_DECLARATION(biases)
1306#endif //defined(HAS_BIAS)
1307)
1308{
1309 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1310 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001311 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1312
1313 // Extract channel and linearized batch indices
1314 const int channel = get_global_id(2) % DST_CHANNELS;
1315 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001316
1317#ifdef HAS_BIAS
1318 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1319
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001320 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001321#endif /* defined(HAS_BIAS) */
1322
1323 half4 pixels0 = 0.0f;
1324 half4 pixels1 = 0.0f;
1325 half4 pixels2 = 0.0f;
1326 half4 pixels3 = 0.0f;
1327
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001328 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1329 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1330 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001331
Usama Arife73686a2019-04-08 17:30:48 +01001332#if(DILATION_X == 1 && DILATION_Y == 1)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001333 // Load the weights
1334 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1335 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1336 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1337
1338 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
1339 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1340 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1341 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1342 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1343 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1344 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
1345
1346 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
1347 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
1348 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
1349 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
1350 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
1351 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
1352 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
1353 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
1354 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
1355 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
1356 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
1357 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
1358
Usama Arife73686a2019-04-08 17:30:48 +01001359#else /* DILATION_X==1 && DILATION_Y==1 */
1360
1361 //3x3 Convolution of elements starting in 0th row
1362 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1363 //3x3 Convolution of elements starting in 1st row
1364 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
1365 //3x3 Convolution of elements starting in 2nd row
1366 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1367 //3x3 Convolution of elements starting in 3rd row
1368 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
1369
1370#endif /* DILATION_X==1 && DILATION_Y==1 */
1371
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001372#ifdef HAS_BIAS
1373 pixels0 += (half4)bias;
1374 pixels1 += (half4)bias;
1375 pixels2 += (half4)bias;
1376 pixels3 += (half4)bias;
1377#endif /* defined(HAS_BIAS) */
1378
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001379 vstore4(ACTIVATION_FUNC(pixels0), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1380 vstore4(ACTIVATION_FUNC(pixels1), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1381 vstore4(ACTIVATION_FUNC(pixels2), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
1382 vstore4(ACTIVATION_FUNC(pixels3), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001383}
1384
1385/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
1386 * when both stride_x and stride_y are equal to 2
1387 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001388 * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
1389 * @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.
1390 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1391 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
1392 * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=half
1393 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001394 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1395 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001396 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001397 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001398 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001399 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001400 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1401 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001402 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1403 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1404 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1405 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1406 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1407 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1408 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1409 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1410 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1411 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1412 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1413 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1414 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1415 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1416 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1417 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1418 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1419 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1420 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1421 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1422 */
1423__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
1424 TENSOR3D_DECLARATION(src),
1425 TENSOR3D_DECLARATION(dst),
1426 TENSOR3D_DECLARATION(weights)
1427#if defined(HAS_BIAS)
1428 ,
1429 VECTOR_DECLARATION(biases)
1430#endif //defined(HAS_BIAS)
1431)
1432{
1433 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1434 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001435 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1436
1437 // Extract channel and linearized batch indices
1438 const int channel = get_global_id(2) % DST_CHANNELS;
1439 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001440
1441#ifdef HAS_BIAS
1442 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1443
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001444 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001445#endif /* defined(HAS_BIAS) */
1446
1447 half4 pixels0 = 0.0f;
1448 half4 pixels1 = 0.0f;
1449
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001450 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1451 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1452 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001453
Usama Arife73686a2019-04-08 17:30:48 +01001454#if(DILATION_X == 1 && DILATION_Y == 1)
1455
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001456 // Load the weights
1457 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1458 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1459 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1460
1461 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
1462 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1463 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1464 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1465 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1466 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1467 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1468 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1469 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1470 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1471 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1472
1473 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1474 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1475 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1476 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1477 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1478 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
1479
Usama Arife73686a2019-04-08 17:30:48 +01001480#else /* DILATION_X==1 && DILATION_Y==1 */
1481 //3x3 Convolution of elements starting in 0th row
1482 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1483 //3x3 Convolution of elements starting in 2nd row
1484 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1485#endif /* DILATION_X==1 && DILATION_Y==1 */
1486
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001487#ifdef HAS_BIAS
1488 pixels0 += (half4)bias;
1489 pixels1 += (half4)bias;
1490#endif /* defined(HAS_BIAS) */
1491
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001492 vstore4(ACTIVATION_FUNC(pixels0), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1493 vstore4(ACTIVATION_FUNC(pixels1), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001494}
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001495#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
Giorgio Arenad051e972018-06-20 11:46:42 +01001496
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001497#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001498
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001499#if DATA_TYPE != float || DATA_TYPE != half
1500#error "Unsupported data type"
1501#endif // DATA_TYPE != float || DATA_TYPE != half
1502
1503#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001504
1505#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1506/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1507 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001508 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001509 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1510 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1511 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1512 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1513 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1514 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001515 * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
1516 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1517 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
1518 * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=half
Giorgio Arenad051e972018-06-20 11:46:42 +01001519 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001520 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001521 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001522 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001523 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001524 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001525 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001526 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1527 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1528 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1529 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001530 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1531 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1532 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1533 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1534 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1535 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1536 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001537 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1538 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001539 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001540 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001541 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1542 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1543 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1544 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1545 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1546 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1547 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1548 * @param[in] max_offset Max offset for the input tensor
1549 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1550 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1551 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1552 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1553 */
1554__kernel void depthwise_convolution_3x3_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001555 TENSOR4D_DECLARATION(src),
1556 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001557 TENSOR3D_DECLARATION(weights),
1558#if defined(HAS_BIAS)
1559 VECTOR_DECLARATION(biases),
1560#endif /* defined(HAS_BIAS) */
1561 int max_offset)
1562{
1563 int x = get_global_id(0); // channels
1564 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001565#if defined(DST_DEPTH)
1566 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1567 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001568#else // defined(DST_DEPTH)
1569 int z = get_global_id(2); // spatial coordinate y
1570#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001571
1572 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1573
Georgios Pinitas37044642018-10-30 14:53:25 +00001574#if defined(DST_DEPTH)
1575 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1576#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001577 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001578#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001579
1580 int z_coord = 0;
1581 int4 offset = 0;
Usama Arife73686a2019-04-08 17:30:48 +01001582 int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001583
1584 // We compute 2x1x1 [C,W,H] elements
1585 VEC_FLOAT acc = 0;
1586
1587 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001588 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1589 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1590 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1591 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1592 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1593 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1594 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1595 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1596 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001597
1598 // Load input values
1599 // z == 0
1600 // Clamp z_coord as for z = 0, it can be negative
1601 // z_coord is casted to unsigned int in order to use just a min() operation
1602 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
1603 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1604 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1605 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001606 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001607
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001608 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1609 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1610 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001611
1612 // z == 1
1613 // z_coord can be only negative for z = 0 so we do not need to clamp it
1614 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Usama Arife73686a2019-04-08 17:30:48 +01001615 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001616 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001617 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1618 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1619 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001620
1621 // z == 2
Usama Arife73686a2019-04-08 17:30:48 +01001622 // Offset can be out-of-bound so we need to check if it is greater than max_offset
1623 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
1624 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001625 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001626 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1627 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1628 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001629
1630 acc = fma(values0, w0, acc);
1631 acc = fma(values1, w1, acc);
1632 acc = fma(values2, w2, acc);
1633
1634 acc = fma(values3, w3, acc);
1635 acc = fma(values4, w4, acc);
1636 acc = fma(values5, w5, acc);
1637
1638 acc = fma(values6, w6, acc);
1639 acc = fma(values7, w7, acc);
1640 acc = fma(values8, w8, acc);
1641
1642#if defined(HAS_BIAS)
1643 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001644 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001645 acc += bias_values;
1646#endif // defined(HAS_BIAS)
1647
Georgios Pinitas37044642018-10-30 14:53:25 +00001648#if defined(DST_DEPTH)
1649 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1650#else /* defined(DST_DEPTH) */
1651 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
1652#endif /* defined(DST_DEPTH) */
1653
Giorgio Arenad051e972018-06-20 11:46:42 +01001654 VSTORE(VEC_SIZE)
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001655 (ACTIVATION_FUNC(acc), 0, (__global DATA_TYPE *)(dst_addr));
Giorgio Arenad051e972018-06-20 11:46:42 +01001656}
1657#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1658
1659#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1660/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1661 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001662 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001663 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1664 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1665 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1666 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1667 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1668 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001669 * @note It is possible to select the activation function to apply using -DFUSED_ACTIVATION e.g. -DFUSED_ACTIVATION=relu
1670 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1671 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size
1672 * @note Select data type should be given too with -DSELECT_DATA_TYPE e.g -DSELECT_DATA_TYPE=half
Giorgio Arenad051e972018-06-20 11:46:42 +01001673 *
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001674 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Georgios Pinitas37044642018-10-30 14:53:25 +00001675 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001676 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001677 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001678 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001679 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001680 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1681 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1682 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1683 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001684 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1685 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1686 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1687 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1688 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1689 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1690 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001691 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1692 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001693 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001694 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32
Giorgio Arenad051e972018-06-20 11:46:42 +01001695 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1696 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1697 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1698 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1699 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1700 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1701 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1702 * @param[in] max_offset Max offset for the input tensor
1703 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1704 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1705 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1706 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1707 */
1708__kernel void depthwise_convolution_3x3_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +00001709 TENSOR4D_DECLARATION(src),
1710 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001711 TENSOR3D_DECLARATION(weights),
1712#if defined(HAS_BIAS)
1713 VECTOR_DECLARATION(biases),
1714#endif /* defined(HAS_BIAS) */
1715 int max_offset)
1716{
1717 int x = get_global_id(0); // channels
1718 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001719#if defined(DST_DEPTH)
1720 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1721 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001722#else // defined(DST_DEPTH)
1723 int z = get_global_id(2); // spatial coordinate y
1724#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001725
1726 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1727
Georgios Pinitas37044642018-10-30 14:53:25 +00001728#if defined(DST_DEPTH)
1729 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1730#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001731 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001732#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001733
1734 int z_coord = 0;
1735 int4 offset = 0;
Georgios Pinitased32f432018-07-10 17:03:11 +01001736 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001737
1738 // We compute 2x2x2 [C,W,H] elements
1739 VEC_FLOAT acc0 = 0;
1740 VEC_FLOAT acc1 = 0;
1741 VEC_FLOAT acc2 = 0;
1742 VEC_FLOAT acc3 = 0;
1743
1744 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001745 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1746 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1747 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1748 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1749 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1750 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1751 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1752 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1753 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001754
1755 // Load input values
1756 // z == 0
1757 // Clamp z_coord as for z = 0, it can be negative
1758 // z_coord is casted to unsigned int in order to use just a min() operation
1759 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitased32f432018-07-10 17:03:11 +01001760 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenad051e972018-06-20 11:46:42 +01001761 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1762 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001763 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001764
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001765 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1766 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1767 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1768 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001769
1770 // z == 1
1771 // z_coord can be only negative for z = 0 so we do not need to clamp it
1772 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001773 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenad051e972018-06-20 11:46:42 +01001774 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001775 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1776 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1777 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1778 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001779
1780 // z == 2
1781 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1782 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1783 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001784 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001785 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1786 VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1787 VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1788 VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001789
1790 // z == 3
1791 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1792 // However offset can be out-of-bound so we need to check if it is greater than max_offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001793 offset += (int4)src_stride_z;
1794 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001795 VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1796 VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1797 VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1798 VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001799
1800 acc0 = fma(values0, w0, acc0);
1801 acc0 = fma(values1, w1, acc0);
1802 acc0 = fma(values2, w2, acc0);
1803 acc1 = fma(values1, w0, acc1);
1804 acc1 = fma(values2, w1, acc1);
1805 acc1 = fma(values3, w2, acc1);
1806
1807 acc0 = fma(values4, w3, acc0);
1808 acc0 = fma(values5, w4, acc0);
1809 acc0 = fma(values6, w5, acc0);
1810 acc1 = fma(values5, w3, acc1);
1811 acc1 = fma(values6, w4, acc1);
1812 acc1 = fma(values7, w5, acc1);
1813
1814 acc0 = fma(values8, w6, acc0);
1815 acc0 = fma(values9, w7, acc0);
1816 acc0 = fma(values10, w8, acc0);
1817 acc1 = fma(values9, w6, acc1);
1818 acc1 = fma(values10, w7, acc1);
1819 acc1 = fma(values11, w8, acc1);
1820
1821 acc2 = fma(values4, w0, acc2);
1822 acc2 = fma(values5, w1, acc2);
1823 acc2 = fma(values6, w2, acc2);
1824 acc3 = fma(values5, w0, acc3);
1825 acc3 = fma(values6, w1, acc3);
1826 acc3 = fma(values7, w2, acc3);
1827
1828 acc2 = fma(values8, w3, acc2);
1829 acc2 = fma(values9, w4, acc2);
1830 acc2 = fma(values10, w5, acc2);
1831 acc3 = fma(values9, w3, acc3);
1832 acc3 = fma(values10, w4, acc3);
1833 acc3 = fma(values11, w5, acc3);
1834
1835 acc2 = fma(values12, w6, acc2);
1836 acc2 = fma(values13, w7, acc2);
1837 acc2 = fma(values14, w8, acc2);
1838 acc3 = fma(values13, w6, acc3);
1839 acc3 = fma(values14, w7, acc3);
1840 acc3 = fma(values15, w8, acc3);
1841
1842#if defined(HAS_BIAS)
1843 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1844
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001845 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001846
1847 acc0 += bias_values;
1848 acc1 += bias_values;
1849 acc2 += bias_values;
1850 acc3 += bias_values;
1851#endif // defined(HAS_BIAS)
1852
Georgios Pinitas37044642018-10-30 14:53:25 +00001853#if defined(DST_DEPTH)
1854 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1855#else /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001856 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001857#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001858
1859 VSTORE(VEC_SIZE)
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001860 (ACTIVATION_FUNC(acc0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001861 VSTORE(VEC_SIZE)
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001862 (ACTIVATION_FUNC(acc1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001863
1864#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1865 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1866#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1867 {
1868 VSTORE(VEC_SIZE)
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001869 (ACTIVATION_FUNC(acc2), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001870 VSTORE(VEC_SIZE)
Manuel Bottinia788c2f2019-04-08 13:18:00 +01001871 (ACTIVATION_FUNC(acc3), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001872 }
1873}
1874
1875#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
giuros016d109962019-01-07 17:47:19 +00001876#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)