blob: 8ee0185fe68b6f3ef76884c29a006e3b8a352a1e [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
giuros016d109962019-01-07 17:47:19 +00002 * Copyright (c) 2017-2019 ARM Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26
Georgios Pinitase55b40a2018-09-13 17:20:04 +010027#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arena9fe41442017-08-23 16:36:24 +010028#if defined(CONV_STRIDE_X)
29
Giorgio Arena93a690e2017-08-01 16:09:33 +010030#if CONV_STRIDE_X == 1
31#define convolution1x3 convolution1x3_stride_1
32#elif CONV_STRIDE_X == 2
33#define convolution1x3 convolution1x3_stride_2
34#elif CONV_STRIDE_X == 3
35#define convolution1x3 convolution1x3_stride_3
36#else /* CONV_STRIDE_X */
37#error "Stride not supported"
38#endif /* CONV_STRIDE_X */
39
40/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
41 *
42 * @param[in] left_pixel Pointer to the left pixel.
43 * @param[in] left_coeff Weight of the left pixel
44 * @param[in] middle_coeff Weight of the middle pixel
45 * @param[in] right_coeff Weight of the right pixel
46 *
47 * @return a float2 containing 2 convoluted values.
48 */
49inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
50 const float left_coeff,
51 const float middle_coeff,
52 const float right_coeff)
53{
Usama Arife73686a2019-04-08 17:30:48 +010054#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +010055 float4 temp = vload4(0, (__global float *)left_pixel);
56
57 float2 left = CONVERT(temp.s01, float2);
58 float2 middle = CONVERT(temp.s12, float2);
59 float2 right = CONVERT(temp.s23, float2);
Giorgio Arena93a690e2017-08-01 16:09:33 +010060 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +010061#else /* DILATION_X==1 && DILATION_Y==1 */
62 return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
63 + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
64 + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
65#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +010066}
67
68/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
69 *
70 * @param[in] left_pixel Pointer to the left pixel.
71 * @param[in] left_coeff Weight of the left pixel
72 * @param[in] middle_coeff Weight of the middle pixel
73 * @param[in] right_coeff Weight of the right pixel
74 *
75 * @return a float2 containing 2 convoluted values.
76 */
77inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
78 const float left_coeff,
79 const float middle_coeff,
80 const float right_coeff)
81{
Usama Arife73686a2019-04-08 17:30:48 +010082#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +010083 float4 temp0 = vload4(0, (__global float *)left_pixel);
84 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
85
86 float2 left = CONVERT(temp0.s02, float2);
87 float2 middle = CONVERT(temp0.s13, float2);
88 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
89
90 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +010091#else /* DILATION_X==1 && DILATION_Y==1 */
92 __global float *left_pixel_float = (__global float *)left_pixel;
93
94 return vload4(0, left_pixel_float).s02 * (float2)left_coeff
95 + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
96 + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
97
98#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +010099}
100
101/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
102 *
103 * @param[in] left_pixel Pointer to the left pixel.
104 * @param[in] left_coeff Weight of the left pixel
105 * @param[in] middle_coeff Weight of the middle pixel
106 * @param[in] right_coeff Weight of the right pixel
107 *
108 * @return a float2 containing 2 convoluted values.
109 */
110inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
111 const float left_coeff,
112 const float middle_coeff,
113 const float right_coeff)
114{
Usama Arife73686a2019-04-08 17:30:48 +0100115#if(DILATION_X == 1 && DILATION_Y == 1)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100116 float4 temp0 = vload4(0, (__global float *)left_pixel);
117 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
118
119 float2 left = CONVERT(temp0.s03, float2);
120 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
121 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
122
123 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +0100124#else /* DILATION_X==1 && DILATION_Y==1 */
125 __global float *left_pixel_float = (__global float *)left_pixel;
126
127 return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
128 + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
129 + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
130#endif /* DILATION_X==1 && DILATION_Y==1 */
Giorgio Arena93a690e2017-08-01 16:09:33 +0100131}
132
133/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
134 *
135 * Convolution matrix layout:
136 *
137 * [ mat0, mat1, mat2 ]\n
138 * [ mat3, mat4, mat5 ]\n
139 * [ mat6, mat7, mat8 ]\n
140 *
141 * @param[in] src A pointer to source Image structure
142 * @param[in] mat0 Coefficient from the convolution matrix
143 * @param[in] mat1 Coefficient from the convolution matrix
144 * @param[in] mat2 Coefficient from the convolution matrix
145 * @param[in] mat3 Coefficient from the convolution matrix
146 * @param[in] mat4 Coefficient from the convolution matrix
147 * @param[in] mat5 Coefficient from the convolution matrix
148 * @param[in] mat6 Coefficient from the convolution matrix
149 * @param[in] mat0 Coefficient from the convolution matrix
150 * @param[in] mat7 Coefficient from the convolution matrix
151 * @param[in] mat8 Coefficient from the convolution matrix
152 *
153 * @return a float2 containing 2 convoluted values.
154 */
155inline float2 convolution3x3(
156 Image *src,
157 const float mat0, const float mat1, const float mat2,
158 const float mat3, const float mat4, const float mat5,
159 const float mat6, const float mat7, const float mat8)
160{
161 float2 pixels;
162
163 pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
Usama Arife73686a2019-04-08 17:30:48 +0100164 pixels += convolution1x3(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
165 pixels += convolution1x3(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
Giorgio Arena93a690e2017-08-01 16:09:33 +0100166
167 return pixels;
168}
169
Gian Marcoc799ed82018-02-01 16:57:48 +0000170/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000171 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000172 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
173 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000174 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000175 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000176 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000177 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000178 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
179 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000180 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000181 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
182 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
183 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
184 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
185 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
186 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
187 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000188 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000189 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
190 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
191 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
192 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
193 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
194 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
195 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
196 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
197 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
198 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
199 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
200 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100201__kernel void depthwise_convolution_3x3(
202 TENSOR3D_DECLARATION(src),
203 TENSOR3D_DECLARATION(dst),
204 TENSOR3D_DECLARATION(weights)
205#if defined(HAS_BIAS)
206 ,
207 VECTOR_DECLARATION(biases)
208#endif //defined(HAS_BIAS)
209)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100210{
211 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
212 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100213 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100214#if defined(HAS_BIAS)
215 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
216#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100217
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100218 // Extract channel and linearized batch indices
219 const int channel = get_global_id(2) % DST_CHANNELS;
220 const int batch = get_global_id(2) / DST_CHANNELS;
221 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
222 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
223 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100224
Giorgio Arena93a690e2017-08-01 16:09:33 +0100225 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100226 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + offset.s0));
227 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + offset.s1));
228 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + offset.s2));
Giorgio Arena93a690e2017-08-01 16:09:33 +0100229
230 float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
231 weights_values1.s0, weights_values1.s1, weights_values1.s2,
232 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100233#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100234 pixels += (float2)(*((__global float *)(biases.ptr + channel * biases_stride_x)));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100235#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100236
237 vstore2(pixels, 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100238}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100239#endif //defined(CONV_STRIDE_X)
240
Usama Arife73686a2019-04-08 17:30:48 +0100241#if(DILATION_X == 1 && DILATION_Y == 1)
242
Gian Marcoc799ed82018-02-01 16:57:48 +0000243#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
244 ({ \
245 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
246 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
247 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
248 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
249 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
250 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
251 })
252
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000253#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
254 ({ \
255 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
256 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
257 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
258 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
259 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
260 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
261 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
262 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
263 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
264 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
265 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
266 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
267 })
268
Gian Marcoc799ed82018-02-01 16:57:48 +0000269#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
270 ({ \
271 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
272 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
273 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
274 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
275 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
276 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
277 })
278
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000279#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
280 ({ \
281 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
282 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
283 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
284 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
285 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
286 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
287 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
288 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
289 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
290 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
291 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
292 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
293 })
294
Usama Arife73686a2019-04-08 17:30:48 +0100295#else /* DILATION_X==1 && DILATION_Y==1 */
296
297#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
298 ({ \
299 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
300 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
301 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
302 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
303 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
304 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
305 })
306
307#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
308 ({ \
309 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
310 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
311 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
312 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
313 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
314 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
315 })
316
317#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
318 ({ \
319 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
320 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
321 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
322 acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1); \
323 acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1); \
324 acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1); \
325 acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2); \
326 acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2); \
327 acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2); \
328 acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3); \
329 acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3); \
330 acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3); \
331 })
332
333#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
334 ({ \
335 acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0); \
336 acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0); \
337 acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0); \
338 acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1); \
339 acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1); \
340 acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1); \
341 acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2); \
342 acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2); \
343 acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2); \
344 acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3); \
345 acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3); \
346 acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3); \
347 })
348
349/** Get the pointer position at a certain offset in x and y direction.
350 *
351 * @param[in] ptr Pointer to the starting position of the buffer
352 * @param[in] x Relative X position
353 * @param[in] y Relative Y position
354 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
355 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
356 */
357inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
358{
359 return ptr + x * stride_x + y * stride_y;
360}
361
362/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 and DILATION_Y>1 for F32
363 *
364 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
365 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
366 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
367 * @param[in] y_offset Offset from the source tensor from which to start convolution
368 * @param[in] weights_addr Pointer from where to get weights
369 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
370 */
371inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
372 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
373{
374 // Load the weights
375 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
376 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
377 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
378
379 float2 pixels0 = 0.0f;
380
381 float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
382 float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
383 float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
384
385 float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
386 float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
387 float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
388
389 float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
390 float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
391 float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
392
393 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
394 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
395 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
396
397 return pixels0;
398}
399
400/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 and DILATION_Y>1 for F32
401 *
402 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
403 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
404 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
405 * @param[in] y_offset Offset from the source tensor from which to start convolution
406 * @param[in] weights_addr Pointer from where to get weights
407 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
408 */
409inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
410 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
411{
412 // Load the weights
413 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
414 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
415 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
416
417 float2 pixels0 = 0.0f;
418
419 float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
420 float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
421 float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
422
423 float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
424 float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
425 float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
426
427 float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
428 float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
429 float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
430
431 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
432 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
433 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
434
435 return pixels0;
436}
437
438/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 and DILATION_Y>1 for f16
439 *
440 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
441 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
442 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
443 * @param[in] y_offset Offset from the source tensor from which to start convolution
444 * @param[in] weights_addr Pointer from where to get weights
445 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
446 */
447inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
448 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
449{
450 // Load the weights
451 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
452 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
453 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
454
455 half4 pixels0 = 0.0f;
456
457 half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
458 half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
459 half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
460
461 half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
462 half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
463 half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
464
465 half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
466 half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
467 half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
468
469 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);
470 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);
471 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);
472
473 return pixels0;
474}
475
476/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 and DILATION_Y>1 for F16
477 *
478 * @param[in] src_addr Pointer to the starting position of where to perform the convolution
479 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
480 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
481 * @param[in] y_offset Offset from the source tensor from which to start convolution
482 * @param[in] weights_addr Pointer from where to get weights
483 * @param[in] weights_stride_y Stride of weights tesnsor in Y dimension
484 */
485inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,
486 const int y_offset, __global uchar *weights_addr, const int weights_stride_y)
487{
488 // Load the weights
489 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
490 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
491 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
492
493 half4 pixels0 = 0.0f;
494
495 half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0
496 half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
497 half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
498
499 half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1
500 half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
501 half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
502
503 half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2
504 half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
505 half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
506
507 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);
508 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);
509 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);
510
511 return pixels0;
512}
513
514#endif /* DILATION_X==1 && DILATION_Y==1 */
515
Gian Marcoc799ed82018-02-01 16:57:48 +0000516/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
517 * stride_x and stride_y are equal to 1
518 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000519 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
520 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000521 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000522 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000523 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000524 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000525 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
526 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
527 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
528 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
529 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
530 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
531 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
532 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
533 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
534 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
535 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
536 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
537 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
538 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
539 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
540 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
541 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
542 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
543 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
544 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
545 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
546 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
547 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000548__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000549 TENSOR3D_DECLARATION(src),
550 TENSOR3D_DECLARATION(dst),
551 TENSOR3D_DECLARATION(weights)
552#if defined(HAS_BIAS)
553 ,
554 VECTOR_DECLARATION(biases)
555#endif //defined(HAS_BIAS)
556)
557{
558 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
559 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100560 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000561
562 float2 pixels0 = 0.0f;
563 float2 pixels1 = 0.0f;
564 float2 pixels2 = 0.0f;
565 float2 pixels3 = 0.0f;
566
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100567 // Extract channel and linearized batch indices
568 const int channel = get_global_id(2) % DST_CHANNELS;
569 const int batch = get_global_id(2) / DST_CHANNELS;
570 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
571 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
572 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000573
Usama Arife73686a2019-04-08 17:30:48 +0100574#if(DILATION_X == 1 && DILATION_Y == 1)
Gian Marcoc799ed82018-02-01 16:57:48 +0000575 // Load the weights
576 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
577 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
578 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
579
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000580 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000581 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
582 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
583 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
584 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000585 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
586 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000587
588 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
589 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
590 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
591 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
592 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
593 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
594 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
595 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
596 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
597 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
598 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
599 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
600
Usama Arife73686a2019-04-08 17:30:48 +0100601#else /* DILATION_X==1 && DILATION_Y==1 */
602
603 //3x3 Convolution of elements starting in 0th row
604 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
605 //3x3 Convolution of elements starting in 1st row
606 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
607 //3x3 Convolution of elements starting in 2nd row
608 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
609 //3x3 Convolution of elements starting in 3rd row
610 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
611
612#endif /* DILATION_X==1 && DILATION_Y==1 */
613
Gian Marcoc799ed82018-02-01 16:57:48 +0000614#ifdef HAS_BIAS
615 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
616
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100617 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000618
619 pixels0 += (float2)bias;
620 pixels1 += (float2)bias;
621 pixels2 += (float2)bias;
622 pixels3 += (float2)bias;
623#endif /* defined(HAS_BIAS) */
624
625 vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
626 vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
627 vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
628 vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
629}
630
631/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
632 * stride_x and stride_y are equal to 2
633 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000634 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
635 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000636 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000637 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000638 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000639 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000640 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
641 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
642 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
643 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
644 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
645 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
646 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
647 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
648 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
649 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
650 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
651 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
652 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
653 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
654 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
655 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
656 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
657 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
658 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
659 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
660 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
661 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
662 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000663__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000664 TENSOR3D_DECLARATION(src),
665 TENSOR3D_DECLARATION(dst),
666 TENSOR3D_DECLARATION(weights)
667#if defined(HAS_BIAS)
668 ,
669 VECTOR_DECLARATION(biases)
670#endif //defined(HAS_BIAS)
671)
672{
673 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
674 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100675 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000676
677 float2 pixels0 = 0.0f;
678 float2 pixels1 = 0.0f;
679
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100680 // Extract channel and linearized batch indices
Georgios Pinitas37044642018-10-30 14:53:25 +0000681 const int channel = get_global_id(2) % DST_CHANNELS;
682 const int batch = get_global_id(2) / DST_CHANNELS;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100683 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
684 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
685 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000686
Usama Arife73686a2019-04-08 17:30:48 +0100687#if(DILATION_X == 1 && DILATION_Y == 1)
688
Gian Marcoc799ed82018-02-01 16:57:48 +0000689 // Load the weights
690 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
691 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
692 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
693
694 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
695 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
696 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
697 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
698 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
699 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
700 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
701 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
702 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
703 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
704 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
705
706 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
707 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
708 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
709 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
710 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
711 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
712
Usama Arife73686a2019-04-08 17:30:48 +0100713#else /* DILATION_X==1 && DILATION_Y==1 */
714
715 //3x3 Convolution of elements starting in 0th row
716 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
717 //3x3 Convolution of elements starting in 2nd row
718 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
719#endif /* DILATION_X==1 && DILATION_Y==1 */
720
Gian Marcoc799ed82018-02-01 16:57:48 +0000721#ifdef HAS_BIAS
722 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
723
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100724 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000725
726 pixels0 += (float2)bias;
727 pixels1 += (float2)bias;
728#endif /* defined(HAS_BIAS) */
729
730 vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
731 vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
732}
733
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100734#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arena76572242018-04-04 17:44:26 +0100735
giuros016d109962019-01-07 17:47:19 +0000736#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
737/** Reshape the weights for quantized depthwise convolution
738 *
739 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
740 * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
741 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
742 * @attention Input's height and width should be 3
743 *
744 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
745 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
746 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
747 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
748 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
749 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
750 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
751 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
752 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
753 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
754 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
755 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
756 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
757 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
758 */
759__kernel void depthwise_convolution_reshape_weights(
760 TENSOR3D_DECLARATION(src),
761 IMAGE_DECLARATION(dst))
762{
763 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
764 const int x = get_global_id(0);
765
766 // Load 3x3xVEC_SIZE weights
767 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
768 w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
769 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
770 w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
771 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
772 w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
773 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
774 w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
775 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
776 w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
777 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
778 w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
779 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
780 w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
781 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
782 w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
783 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
784 w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);
785
786 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);
787
788#if defined(TRANSPOSE)
789#if VEC_SIZE != 4
790#error "VEC_SIZE not supported"
791#else // VEC_SIZE != 4
792 VSTORE(VEC_SIZE)
793 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
794 VSTORE(VEC_SIZE)
795 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
796 VSTORE(VEC_SIZE)
797 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
798 VSTORE(VEC_SIZE)
799 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
800 VSTORE(VEC_SIZE)
801 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
802 VSTORE(VEC_SIZE)
803 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
804 VSTORE(VEC_SIZE)
805 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
806 VSTORE(VEC_SIZE)
807 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
808 VSTORE(VEC_SIZE)
809 ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
810#endif // VEC_SIZE != 4
811#else // !defined(TRANSPOSE)
812 VSTORE(VEC_SIZE)
813 (w0, 0, dst_addr + 0);
814 VSTORE(VEC_SIZE)
815 (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
816 VSTORE(VEC_SIZE)
817 (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
818 VSTORE(VEC_SIZE)
819 (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
820 VSTORE(VEC_SIZE)
821 (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
822 VSTORE(VEC_SIZE)
823 (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
824 VSTORE(VEC_SIZE)
825 (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
826 VSTORE(VEC_SIZE)
827 (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
828 VSTORE(VEC_SIZE)
829 (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
830#endif // defined(TRANSPOSE)
831}
832#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
833
Giorgio Arenad051e972018-06-20 11:46:42 +0100834#if defined(NCHW)
835#define in_stride_x src_stride_x
836#define in_stride_y src_stride_y
837#define in_stride_z src_stride_z
838#define out_stride_x dst_stride_x
839#define out_stride_y dst_stride_y
840#define out_stride_z dst_stride_z
841#else //defined(NCHW)
842#define in_stride_x src_stride_y
843#define in_stride_y src_stride_z
844#define in_stride_z src_stride_x
845#define out_stride_x dst_stride_y
846#define out_stride_y dst_stride_z
847#define out_stride_z dst_stride_x
848#endif //defined(NCHW)
849
Giorgio Arena9fe41442017-08-23 16:36:24 +0100850#if defined(SRC_WIDTH) && defined(DATA_TYPE)
851/** This kernel reshapes each of the tensor's low three dimensions to single rows.
852 *
853 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
854 *
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100855 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
856 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
857 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
858 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
859 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
860 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
861 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
862 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
863 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
864 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
865 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
866 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
867 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
868 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
869 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
870 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
871 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
872 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Giorgio Arena9fe41442017-08-23 16:36:24 +0100873 */
giuros016d109962019-01-07 17:47:19 +0000874__kernel void depthwise_convolution_reshape_weights_generic(
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100875 TENSOR3D_DECLARATION(src),
876 IMAGE_DECLARATION(dst)
877#ifdef HAS_BIAS
878 ,
879 VECTOR_DECLARATION(biases)
880#endif /* HAS_BIAS */
881)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100882{
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100883#ifdef HAS_BIAS
884 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
885#endif /* HAS_BIAS */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100886
Giorgio Arenad051e972018-06-20 11:46:42 +0100887 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
888 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100889
Giorgio Arenad051e972018-06-20 11:46:42 +0100890 for(int i = 0; i < SRC_WIDTH; ++i, input_ptr += in_stride_x)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100891 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100892 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *((__global DATA_TYPE *)input_ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100893 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100894
895#if defined(HAS_BIAS)
896 if(get_global_id(1) == 0)
897 {
Michele Di Giorgiod24af8a2018-05-08 17:23:52 +0100898 *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100899 }
900#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100901}
902#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
903
Usama Arife73686a2019-04-08 17:30:48 +0100904#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER) && defined(DILATION_X) && defined(DILATION_Y)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100905/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
906 *
907 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Giorgio Arena76572242018-04-04 17:44:26 +0100908 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
Usama Arife73686a2019-04-08 17:30:48 +0100909 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
Giorgio Arena9fe41442017-08-23 16:36:24 +0100910 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100911 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100912 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
913 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
914 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
915 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
916 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
917 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
918 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
919 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
920 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
921 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
922 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
923 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
924 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
925 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
926 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
927 */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100928__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
929{
930 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
931
932 const int src_pixel_linear = get_global_id(1) * STRIDE_X;
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100933 const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
Usama Arife73686a2019-04-08 17:30:48 +0100934 const int max_initial_x = STRIDE_X * (((full_length - (KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1))) / STRIDE_X) + 1);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100935
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100936 const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
937 const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
Giorgio Arena76572242018-04-04 17:44:26 +0100938 const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100939
Giorgio Arenad051e972018-06-20 11:46:42 +0100940 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100941 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
942
Usama Arife73686a2019-04-08 17:30:48 +0100943 for(int y = src_y; y < src_y + KERNEL_HEIGHT + (KERNEL_HEIGHT - 1) * (DILATION_Y - 1); y += DILATION_Y)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100944 {
Usama Arife73686a2019-04-08 17:30:48 +0100945 for(int x = src_x; x < src_x + KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1); x += DILATION_X, ++output_ptr)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100946 {
947 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
948 {
Georgios Pinitasde5a1cc2018-02-02 12:52:07 +0000949 *output_ptr = PAD_VALUE;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100950 }
951 else
952 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100953 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * in_stride_x + y * in_stride_y));
Giorgio Arena9fe41442017-08-23 16:36:24 +0100954 }
955 }
956 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100957#if defined(HAS_BIAS)
958 *output_ptr = (DATA_TYPE)(1);
959#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100960}
961
Giorgio Arena76572242018-04-04 17:44:26 +0100962#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100963
964#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
965
966/** This kernel performs a reshaping of the output of the depthwise generic convolution.
967 *
968 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
969 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
970 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100971 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100972 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
973 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
974 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
975 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
976 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
977 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
978 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
979 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
980 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
981 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
982 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
983 */
984__kernel void depthwise_vector_to_tensor(
985 VECTOR_DECLARATION(src),
986 TENSOR3D_DECLARATION(dst))
987{
988 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
989
990 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
991 const int id0 = get_global_id(0);
992 const int z = id0 / patch_size;
993 const int index2D = id0 - z * patch_size;
994
Giorgio Arenad051e972018-06-20 11:46:42 +0100995 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * out_stride_x + index2D / CONV_WIDTH * out_stride_y + z * out_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100996 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
997}
998
999#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001000
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001001#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001002#if defined(CONV_STRIDE_X)
1003#if CONV_STRIDE_X == 1
1004#define convolution1x3_f16 convolution1x3_stride_1_f16
1005#elif CONV_STRIDE_X == 2
1006#define convolution1x3_f16 convolution1x3_stride_2_f16
1007#elif CONV_STRIDE_X == 3
1008#define convolution1x3_f16 convolution1x3_stride_3_f16
1009#else /* CONV_STRIDE_X */
1010#error "Stride not supported"
1011#endif /* CONV_STRIDE_X */
1012
1013/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
1014 *
1015 * @param[in] left_pixel Pointer to the left pixel.
1016 * @param[in] left_coeff Weight of the left pixel
1017 * @param[in] middle_coeff Weight of the middle pixel
1018 * @param[in] right_coeff Weight of the right pixel
1019 *
1020 * @return a half4 containing 4 convoluted values.
1021 */
1022inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
1023 const half left_coeff,
1024 const half middle_coeff,
1025 const half right_coeff)
1026{
Usama Arife73686a2019-04-08 17:30:48 +01001027#if(DILATION_X == 1 && DILATION_Y == 1)
1028
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001029 half8 temp = vload8(0, (__global half *)left_pixel);
1030
1031 half4 left = CONVERT(temp.s0123, half4);
1032 half4 middle = CONVERT(temp.s1234, half4);
1033 half4 right = CONVERT(temp.s2345, half4);
1034
1035 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001036#else /* DILATION_X==1 && DILATION_Y==1 */
1037 return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
1038 + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
1039 + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
1040
1041#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001042}
1043
1044/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
1045 *
1046 * @param[in] left_pixel Pointer to the left pixel.
1047 * @param[in] left_coeff Weight of the left pixel
1048 * @param[in] middle_coeff Weight of the middle pixel
1049 * @param[in] right_coeff Weight of the right pixel
1050 *
1051 * @return a half4 containing 4 convoluted values.
1052 */
1053inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
1054 const half left_coeff,
1055 const half middle_coeff,
1056 const half right_coeff)
1057{
Usama Arife73686a2019-04-08 17:30:48 +01001058#if(DILATION_X == 1 && DILATION_Y == 1)
1059
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001060 half8 temp0 = vload8(0, (__global half *)left_pixel);
1061 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
1062
1063 half4 left = CONVERT(temp0.s0246, half4);
1064 half4 middle = CONVERT(temp0.s1357, half4);
1065 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
1066
1067 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001068#else /* DILATION_X==1 && DILATION_Y==1 */
1069
1070 __global half *left_pixel_float = (__global half *)left_pixel;
1071
1072 return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
1073 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
1074 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
1075
1076#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001077}
1078
1079/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
1080 *
1081 * @param[in] left_pixel Pointer to the left pixel.
1082 * @param[in] left_coeff Weight of the left pixel
1083 * @param[in] middle_coeff Weight of the middle pixel
1084 * @param[in] right_coeff Weight of the right pixel
1085 *
1086 * @return a half4 containing 4 convoluted values.
1087 */
1088inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
1089 const half left_coeff,
1090 const half middle_coeff,
1091 const half right_coeff)
1092{
Usama Arife73686a2019-04-08 17:30:48 +01001093#if(DILATION_X == 1 && DILATION_Y == 1)
1094
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001095 half16 temp0 = vload16(0, (__global half *)left_pixel);
1096
1097 half4 left = CONVERT(temp0.s0369, half4);
1098 half4 middle = CONVERT(temp0.s147A, half4);
1099 half4 right = CONVERT(temp0.s258B, half4);
1100
1101 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
Usama Arife73686a2019-04-08 17:30:48 +01001102#else /* DILATION_X==1 && DILATION_Y==1 */
1103
1104 __global half *left_pixel_float = (__global half *)left_pixel;
1105
1106 return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
1107 + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
1108 + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
1109
1110#endif /* DILATION_X==1 && DILATION_Y==1 */
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001111}
1112
1113/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
1114 *
1115 * Convolution matrix layout:
1116 *
1117 * [ mat0, mat1, mat2 ]\n
1118 * [ mat3, mat4, mat5 ]\n
1119 * [ mat6, mat7, mat8 ]\n
1120 *
1121 * @param[in] src A pointer to source Image structure
1122 * @param[in] mat0 Coefficient from the convolution matrix
1123 * @param[in] mat1 Coefficient from the convolution matrix
1124 * @param[in] mat2 Coefficient from the convolution matrix
1125 * @param[in] mat3 Coefficient from the convolution matrix
1126 * @param[in] mat4 Coefficient from the convolution matrix
1127 * @param[in] mat5 Coefficient from the convolution matrix
1128 * @param[in] mat6 Coefficient from the convolution matrix
1129 * @param[in] mat0 Coefficient from the convolution matrix
1130 * @param[in] mat7 Coefficient from the convolution matrix
1131 * @param[in] mat8 Coefficient from the convolution matrix
1132 *
1133 * @return a half4 containing 4 convoluted values.
1134 */
1135inline half4 convolution3x3_f16(
1136 Image *src,
1137 const half mat0, const half mat1, const half mat2,
1138 const half mat3, const half mat4, const half mat5,
1139 const half mat6, const half mat7, const half mat8)
1140{
1141 half4 pixels;
1142
1143 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
Usama Arife73686a2019-04-08 17:30:48 +01001144 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);
1145 pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001146
1147 return pixels;
1148}
1149
Giorgio Arena76572242018-04-04 17:44:26 +01001150#if defined(DEPTH_MULTIPLIER)
1151
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001152/** This OpenCL kernel computes the depthwise convolution 3x3
1153 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001154 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1155 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001156 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001157 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001158 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001159 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1160 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001161 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001162 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001163 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1164 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1165 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1166 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1167 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1168 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1169 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001170 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001171 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1172 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1173 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1174 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1175 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1176 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1177 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1178 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
1179 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1180 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1181 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1182 */
1183__kernel void depthwise_convolution_3x3_f16(
1184 TENSOR3D_DECLARATION(src),
1185 TENSOR3D_DECLARATION(dst),
1186 TENSOR3D_DECLARATION(weights)
1187#if defined(HAS_BIAS)
1188 ,
1189 VECTOR_DECLARATION(biases)
1190#endif //defined(HAS_BIAS)
1191)
1192{
1193 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1194 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001195 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001196#if defined(HAS_BIAS)
1197 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1198#endif //defined(HAS_BIAS)
1199
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001200 // Extract channel and linearized batch indices
1201 const int channel = get_global_id(2) % DST_CHANNELS;
1202 const int batch = get_global_id(2) / DST_CHANNELS;
1203 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1204 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
1205 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +01001206
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001207 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001208 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
1209 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
1210 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001211
1212 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
1213 weights_values1.s0, weights_values1.s1, weights_values1.s2,
1214 weights_values2.s0, weights_values2.s1, weights_values2.s2);
1215#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001216 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001217#endif //defined(HAS_BIAS)
1218
1219 vstore4(pixels, 0, (__global half *)dst.ptr);
1220}
Giorgio Arena76572242018-04-04 17:44:26 +01001221#endif // defined(DEPTH_MULTIPLIER)
Michele Di Giorgio933fe862018-02-19 15:42:12 +00001222#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001223
1224/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
1225 * when both stride_x and stride_y are equal to 1
1226 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001227 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1228 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001229 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001230 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001231 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001232 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1233 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001234 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001235 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1236 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1237 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1238 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1239 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1240 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1241 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1242 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1243 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1244 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1245 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1246 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1247 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1248 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1249 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1250 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1251 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1252 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1253 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1254 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1255 */
1256__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
1257 TENSOR3D_DECLARATION(src),
1258 TENSOR3D_DECLARATION(dst),
1259 TENSOR3D_DECLARATION(weights)
1260#if defined(HAS_BIAS)
1261 ,
1262 VECTOR_DECLARATION(biases)
1263#endif //defined(HAS_BIAS)
1264)
1265{
1266 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1267 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001268 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1269
1270 // Extract channel and linearized batch indices
1271 const int channel = get_global_id(2) % DST_CHANNELS;
1272 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001273
1274#ifdef HAS_BIAS
1275 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1276
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001277 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001278#endif /* defined(HAS_BIAS) */
1279
1280 half4 pixels0 = 0.0f;
1281 half4 pixels1 = 0.0f;
1282 half4 pixels2 = 0.0f;
1283 half4 pixels3 = 0.0f;
1284
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001285 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1286 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1287 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001288
Usama Arife73686a2019-04-08 17:30:48 +01001289#if(DILATION_X == 1 && DILATION_Y == 1)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001290 // Load the weights
1291 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1292 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1293 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1294
1295 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
1296 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1297 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1298 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1299 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1300 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1301 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
1302
1303 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
1304 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
1305 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
1306 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
1307 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
1308 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
1309 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
1310 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
1311 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
1312 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
1313 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
1314 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
1315
Usama Arife73686a2019-04-08 17:30:48 +01001316#else /* DILATION_X==1 && DILATION_Y==1 */
1317
1318 //3x3 Convolution of elements starting in 0th row
1319 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1320 //3x3 Convolution of elements starting in 1st row
1321 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);
1322 //3x3 Convolution of elements starting in 2nd row
1323 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1324 //3x3 Convolution of elements starting in 3rd row
1325 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);
1326
1327#endif /* DILATION_X==1 && DILATION_Y==1 */
1328
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001329#ifdef HAS_BIAS
1330 pixels0 += (half4)bias;
1331 pixels1 += (half4)bias;
1332 pixels2 += (half4)bias;
1333 pixels3 += (half4)bias;
1334#endif /* defined(HAS_BIAS) */
1335
1336 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1337 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1338 vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
1339 vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
1340}
1341
1342/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
1343 * when both stride_x and stride_y are equal to 2
1344 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001345 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
1346 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001347 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001348 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001349 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001350 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001351 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1352 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001353 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
1354 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1355 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1356 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1357 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1358 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1359 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1360 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1361 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1362 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1363 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1364 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1365 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1366 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1367 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1368 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
1369 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
1370 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1371 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1372 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1373 */
1374__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
1375 TENSOR3D_DECLARATION(src),
1376 TENSOR3D_DECLARATION(dst),
1377 TENSOR3D_DECLARATION(weights)
1378#if defined(HAS_BIAS)
1379 ,
1380 VECTOR_DECLARATION(biases)
1381#endif //defined(HAS_BIAS)
1382)
1383{
1384 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
1385 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001386 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
1387
1388 // Extract channel and linearized batch indices
1389 const int channel = get_global_id(2) % DST_CHANNELS;
1390 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001391
1392#ifdef HAS_BIAS
1393 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
1394
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001395 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001396#endif /* defined(HAS_BIAS) */
1397
1398 half4 pixels0 = 0.0f;
1399 half4 pixels1 = 0.0f;
1400
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001401 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
1402 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
1403 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001404
Usama Arife73686a2019-04-08 17:30:48 +01001405#if(DILATION_X == 1 && DILATION_Y == 1)
1406
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001407 // Load the weights
1408 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
1409 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
1410 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
1411
1412 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
1413 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1414 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1415 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1416 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1417 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1418 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1419 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1420 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1421 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1422 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1423
1424 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1425 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1426 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1427 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1428 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1429 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
1430
Usama Arife73686a2019-04-08 17:30:48 +01001431#else /* DILATION_X==1 && DILATION_Y==1 */
1432 //3x3 Convolution of elements starting in 0th row
1433 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
1434 //3x3 Convolution of elements starting in 2nd row
1435 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
1436#endif /* DILATION_X==1 && DILATION_Y==1 */
1437
Michele Di Giorgio3ebef322018-02-21 10:02:58 +00001438#ifdef HAS_BIAS
1439 pixels0 += (half4)bias;
1440 pixels1 += (half4)bias;
1441#endif /* defined(HAS_BIAS) */
1442
1443 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1444 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1445}
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001446#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arenad051e972018-06-20 11:46:42 +01001447
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001448#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001449
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001450#if DATA_TYPE != float || DATA_TYPE != half
1451#error "Unsupported data type"
1452#endif // DATA_TYPE != float || DATA_TYPE != half
1453
1454#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001455
1456#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1457/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1458 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001459 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001460 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1461 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1462 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1463 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1464 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1465 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
1466 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001467 * @param[in] src_ptr Pointer to the source tensor. Supported data types: FP32
1468 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001469 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001470 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001471 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001472 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001473 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1474 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1475 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1476 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001477 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1478 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1479 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1480 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1481 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1482 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1483 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001484 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1485 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001486 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1487 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
1488 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1489 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1490 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1491 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1492 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1493 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1494 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1495 * @param[in] max_offset Max offset for the input tensor
1496 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1497 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1498 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1499 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1500 */
1501__kernel void depthwise_convolution_3x3_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001502 TENSOR4D_DECLARATION(src),
1503 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001504 TENSOR3D_DECLARATION(weights),
1505#if defined(HAS_BIAS)
1506 VECTOR_DECLARATION(biases),
1507#endif /* defined(HAS_BIAS) */
1508 int max_offset)
1509{
1510 int x = get_global_id(0); // channels
1511 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001512#if defined(DST_DEPTH)
1513 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1514 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001515#else // defined(DST_DEPTH)
1516 int z = get_global_id(2); // spatial coordinate y
1517#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001518
1519 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1520
Georgios Pinitas37044642018-10-30 14:53:25 +00001521#if defined(DST_DEPTH)
1522 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1523#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001524 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001525#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001526
1527 int z_coord = 0;
1528 int4 offset = 0;
Usama Arife73686a2019-04-08 17:30:48 +01001529 int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001530
1531 // We compute 2x1x1 [C,W,H] elements
1532 VEC_FLOAT acc = 0;
1533
1534 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001535 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1536 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1537 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1538 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1539 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1540 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1541 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1542 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1543 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001544
1545 // Load input values
1546 // z == 0
1547 // Clamp z_coord as for z = 0, it can be negative
1548 // z_coord is casted to unsigned int in order to use just a min() operation
1549 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
1550 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1551 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1552 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001553 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001554
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001555 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1556 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1557 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001558
1559 // z == 1
1560 // z_coord can be only negative for z = 0 so we do not need to clamp it
1561 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Usama Arife73686a2019-04-08 17:30:48 +01001562 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001563 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001564 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1565 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1566 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001567
1568 // z == 2
Usama Arife73686a2019-04-08 17:30:48 +01001569 // Offset can be out-of-bound so we need to check if it is greater than max_offset
1570 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
1571 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001572 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001573 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1574 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1575 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001576
1577 acc = fma(values0, w0, acc);
1578 acc = fma(values1, w1, acc);
1579 acc = fma(values2, w2, acc);
1580
1581 acc = fma(values3, w3, acc);
1582 acc = fma(values4, w4, acc);
1583 acc = fma(values5, w5, acc);
1584
1585 acc = fma(values6, w6, acc);
1586 acc = fma(values7, w7, acc);
1587 acc = fma(values8, w8, acc);
1588
1589#if defined(HAS_BIAS)
1590 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001591 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001592 acc += bias_values;
1593#endif // defined(HAS_BIAS)
1594
Georgios Pinitas37044642018-10-30 14:53:25 +00001595#if defined(DST_DEPTH)
1596 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1597#else /* defined(DST_DEPTH) */
1598 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
1599#endif /* defined(DST_DEPTH) */
1600
Giorgio Arenad051e972018-06-20 11:46:42 +01001601 VSTORE(VEC_SIZE)
Georgios Pinitas37044642018-10-30 14:53:25 +00001602 (acc, 0, (__global DATA_TYPE *)(dst_addr));
Giorgio Arenad051e972018-06-20 11:46:42 +01001603}
1604#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1605
1606#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1607/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1608 *
1609 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1610 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1611 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1612 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1613 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1614 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1615 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001616 * @param[in] src_ptr Pointer to the source tensor. Supported data types: FP32
1617 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001618 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001619 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001620 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001621 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001622 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1623 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1624 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1625 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001626 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1627 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1628 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1629 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1630 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1631 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1632 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001633 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1634 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001635 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1636 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
1637 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1638 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1639 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1640 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1641 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1642 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1643 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1644 * @param[in] max_offset Max offset for the input tensor
1645 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1646 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1647 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1648 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1649 */
1650__kernel void depthwise_convolution_3x3_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +00001651 TENSOR4D_DECLARATION(src),
1652 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001653 TENSOR3D_DECLARATION(weights),
1654#if defined(HAS_BIAS)
1655 VECTOR_DECLARATION(biases),
1656#endif /* defined(HAS_BIAS) */
1657 int max_offset)
1658{
1659 int x = get_global_id(0); // channels
1660 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001661#if defined(DST_DEPTH)
1662 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1663 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Usama Arife73686a2019-04-08 17:30:48 +01001664#else // defined(DST_DEPTH)
1665 int z = get_global_id(2); // spatial coordinate y
1666#endif // defined(DST_DEPTH)
Giorgio Arenad051e972018-06-20 11:46:42 +01001667
1668 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1669
Georgios Pinitas37044642018-10-30 14:53:25 +00001670#if defined(DST_DEPTH)
1671 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1672#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001673 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001674#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001675
1676 int z_coord = 0;
1677 int4 offset = 0;
Georgios Pinitased32f432018-07-10 17:03:11 +01001678 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001679
1680 // We compute 2x2x2 [C,W,H] elements
1681 VEC_FLOAT acc0 = 0;
1682 VEC_FLOAT acc1 = 0;
1683 VEC_FLOAT acc2 = 0;
1684 VEC_FLOAT acc3 = 0;
1685
1686 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001687 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1688 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1689 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1690 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1691 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1692 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1693 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1694 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1695 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001696
1697 // Load input values
1698 // z == 0
1699 // Clamp z_coord as for z = 0, it can be negative
1700 // z_coord is casted to unsigned int in order to use just a min() operation
1701 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitased32f432018-07-10 17:03:11 +01001702 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenad051e972018-06-20 11:46:42 +01001703 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1704 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001705 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001706
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001707 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1708 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1709 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1710 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001711
1712 // z == 1
1713 // z_coord can be only negative for z = 0 so we do not need to clamp it
1714 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001715 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenad051e972018-06-20 11:46:42 +01001716 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001717 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1718 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1719 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1720 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001721
1722 // z == 2
1723 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1724 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1725 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001726 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001727 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1728 VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1729 VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1730 VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001731
1732 // z == 3
1733 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1734 // However offset can be out-of-bound so we need to check if it is greater than max_offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001735 offset += (int4)src_stride_z;
1736 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001737 VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1738 VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1739 VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1740 VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001741
1742 acc0 = fma(values0, w0, acc0);
1743 acc0 = fma(values1, w1, acc0);
1744 acc0 = fma(values2, w2, acc0);
1745 acc1 = fma(values1, w0, acc1);
1746 acc1 = fma(values2, w1, acc1);
1747 acc1 = fma(values3, w2, acc1);
1748
1749 acc0 = fma(values4, w3, acc0);
1750 acc0 = fma(values5, w4, acc0);
1751 acc0 = fma(values6, w5, acc0);
1752 acc1 = fma(values5, w3, acc1);
1753 acc1 = fma(values6, w4, acc1);
1754 acc1 = fma(values7, w5, acc1);
1755
1756 acc0 = fma(values8, w6, acc0);
1757 acc0 = fma(values9, w7, acc0);
1758 acc0 = fma(values10, w8, acc0);
1759 acc1 = fma(values9, w6, acc1);
1760 acc1 = fma(values10, w7, acc1);
1761 acc1 = fma(values11, w8, acc1);
1762
1763 acc2 = fma(values4, w0, acc2);
1764 acc2 = fma(values5, w1, acc2);
1765 acc2 = fma(values6, w2, acc2);
1766 acc3 = fma(values5, w0, acc3);
1767 acc3 = fma(values6, w1, acc3);
1768 acc3 = fma(values7, w2, acc3);
1769
1770 acc2 = fma(values8, w3, acc2);
1771 acc2 = fma(values9, w4, acc2);
1772 acc2 = fma(values10, w5, acc2);
1773 acc3 = fma(values9, w3, acc3);
1774 acc3 = fma(values10, w4, acc3);
1775 acc3 = fma(values11, w5, acc3);
1776
1777 acc2 = fma(values12, w6, acc2);
1778 acc2 = fma(values13, w7, acc2);
1779 acc2 = fma(values14, w8, acc2);
1780 acc3 = fma(values13, w6, acc3);
1781 acc3 = fma(values14, w7, acc3);
1782 acc3 = fma(values15, w8, acc3);
1783
1784#if defined(HAS_BIAS)
1785 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1786
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001787 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001788
1789 acc0 += bias_values;
1790 acc1 += bias_values;
1791 acc2 += bias_values;
1792 acc3 += bias_values;
1793#endif // defined(HAS_BIAS)
1794
Georgios Pinitas37044642018-10-30 14:53:25 +00001795#if defined(DST_DEPTH)
1796 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1797#else /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001798 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001799#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001800
1801 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001802 (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001803 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001804 (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001805
1806#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1807 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1808#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1809 {
1810 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001811 (acc2, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001812 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001813 (acc3, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001814 }
1815}
1816
1817#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
giuros016d109962019-01-07 17:47:19 +00001818#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)