/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "helpers.h"

#if defined(DEPTH_MULTIPLIER)
#if defined(CONV_STRIDE_X)

#if CONV_STRIDE_X == 1
#define convolution1x3 convolution1x3_stride_1
#elif CONV_STRIDE_X == 2
#define convolution1x3 convolution1x3_stride_2
#elif CONV_STRIDE_X == 3
#define convolution1x3 convolution1x3_stride_3
#else /* CONV_STRIDE_X */
#error "Stride not supported"
#endif /* CONV_STRIDE_X */

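// Example only: these kernels are specialised at compile time through preprocessor definitions
// supplied by the host build options. For instance, something like
//     "-DDEPTH_MULTIPLIER=1 -DCONV_STRIDE_X=1 -DHAS_BIAS"
// would select the float, stride-1 path below. The exact option strings assembled by the host
// code are an assumption here and may differ; only the macro names are taken from this file.
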
/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
                                      const float left_coeff,
                                      const float middle_coeff,
                                      const float right_coeff)
{
    float4 temp = vload4(0, (__global float *)left_pixel);

    float2 left   = CONVERT(temp.s01, float2);
    float2 middle = CONVERT(temp.s12, float2);
    float2 right  = CONVERT(temp.s23, float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
}
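
// Worked example for stride 1: with in0..in3 being the four floats loaded from left_pixel,
//     out.s0 = left_coeff * in0 + middle_coeff * in1 + right_coeff * in2
//     out.s1 = left_coeff * in1 + middle_coeff * in2 + right_coeff * in3
// i.e. the two horizontally adjacent output pixels share two of their three input samples.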

/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
                                      const float left_coeff,
                                      const float middle_coeff,
                                      const float right_coeff)
{
    float4 temp0 = vload4(0, (__global float *)left_pixel);
    float  temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));

    float2 left   = CONVERT(temp0.s02, float2);
    float2 middle = CONVERT(temp0.s13, float2);
    float2 right  = CONVERT((float2)(temp0.s2, temp1), float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
}

/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
                                      const float left_coeff,
                                      const float middle_coeff,
                                      const float right_coeff)
{
    float4 temp0 = vload4(0, (__global float *)left_pixel);
    float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));

    float2 left   = CONVERT(temp0.s03, float2);
    float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
    float2 right  = CONVERT((float2)(temp0.s2, temp1.s1), float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
}
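
// Worked example for strides 2 and 3: with in0..in5 being consecutive input floats and l/m/r the
// left/middle/right coefficients,
//     stride 2: out.s0 = l*in0 + m*in1 + r*in2,   out.s1 = l*in2 + m*in3 + r*in4
//     stride 3: out.s0 = l*in0 + m*in1 + r*in2,   out.s1 = l*in3 + m*in4 + r*in5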

/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
 *
 * Convolution matrix layout:
 *
 * [ mat0, mat1, mat2 ]\n
 * [ mat3, mat4, mat5 ]\n
 * [ mat6, mat7, mat8 ]\n
 *
 * @param[in] src A pointer to source Image structure
 * @param[in] mat0 Coefficient from the convolution matrix
 * @param[in] mat1 Coefficient from the convolution matrix
 * @param[in] mat2 Coefficient from the convolution matrix
 * @param[in] mat3 Coefficient from the convolution matrix
 * @param[in] mat4 Coefficient from the convolution matrix
 * @param[in] mat5 Coefficient from the convolution matrix
 * @param[in] mat6 Coefficient from the convolution matrix
 * @param[in] mat7 Coefficient from the convolution matrix
 * @param[in] mat8 Coefficient from the convolution matrix
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution3x3(
    Image      *src,
    const float mat0, const float mat1, const float mat2,
    const float mat3, const float mat4, const float mat5,
    const float mat6, const float mat7, const float mat8)
{
    float2 pixels;

    pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
    pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
    pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);

    return pixels;
}
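
// Equivalently, for output column i (i = 0, 1) at the work-item's position (x, y):
//     pixels[i] = sum over r = 0..2, c = 0..2 of mat(3*r + c) * src(x + CONV_STRIDE_X*i + c, y + r)
// where the column term reduces to (x + i + c) for the stride-1 specialisation.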

/** This OpenCL kernel computes the depthwise convolution 3x3
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)

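    // Depth multiplier handling: output channel get_global_id(2) reads input channel
    // get_global_id(2) / DEPTH_MULTIPLIER. The CONVERT_TENSOR3D_TO_IMAGE_STRUCT helper above has
    // already advanced src.ptr by get_global_id(2) * src_step_z, so the line below rewinds the
    // surplus (z - z / DEPTH_MULTIPLIER) planes. For example, with DEPTH_MULTIPLIER = 2, output
    // channels 4 and 5 both read input channel 2.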
    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
    float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
    float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
    float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));

    float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                   weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                   weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
    pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
#endif //defined(HAS_BIAS)

    vstore2(pixels, 0, (__global float *)dst.ptr);
}
#endif //defined(CONV_STRIDE_X)

#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
    ({                                                             \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
    })

#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
    ({                                                             \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
        acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2);            \
        acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2);            \
        acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2);            \
        acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3);            \
        acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3);            \
        acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3);            \
    })

#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
    ({                                                                   \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
        acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1);                  \
    })

#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
    ({                                                                   \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
        acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1);                  \
        acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2);                  \
        acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2);                  \
        acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2);                  \
        acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3);                  \
        acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3);                  \
        acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3);                  \
    })

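// Note: the CONVOLUTION1x3_BIFROST<N>X1_STRIDE<S> macros above accumulate one 1x3 weight row into
// N horizontally adjacent accumulators using fma, which maps onto Bifrost's fused multiply-add
// pipelines. For example, CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, s, w) computes
//     acc.s0 += s.s0 * w.s0 + s.s1 * w.s1 + s.s2 * w.s2
//     acc.s1 += s.s1 * w.s0 + s.s2 * w.s1 + s.s3 * w.s2
// and the STRIDE2 variants take the spill-over column in the extra src1 argument.
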
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
 * stride_x and stride_y are equal to 1
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

    float2 pixels0 = 0.0f;
    float2 pixels1 = 0.0f;
    float2 pixels2 = 0.0f;
    float2 pixels3 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
    float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5

    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));

    pixels0 += (float2)bias;
    pixels1 += (float2)bias;
    pixels2 += (float2)bias;
    pixels3 += (float2)bias;
#endif /* defined(HAS_BIAS) */

    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
    vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
    vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
}

/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
 * stride_x and stride_y are equal to 2
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

    float2 pixels0 = 0.0f;
    float2 pixels1 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 2x2 elements, we need to load 5 rows from the input tensor
    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
    float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4

    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));

    pixels0 += (float2)bias;
    pixels1 += (float2)bias;
#endif /* defined(HAS_BIAS) */

    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
}
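
// Note on the load pattern above: with stride 2, the two output columns of a row consume input
// columns 0..2 and 2..4, i.e. 5 consecutive floats per row. They are fetched as a vload4 plus a
// vload2 starting at element 4 (offset 2 in float2 units), of which only the first lane is used by
// CONVOLUTION1x3_BIFROST2X1_STRIDE2.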

#endif // defined(DEPTH_MULTIPLIER)

#if defined(SRC_WIDTH) && defined(DATA_TYPE)
/** This kernel reshapes each plane of the tensor's lowest three dimensions into a single row.
 *
 * @note Datatype and source width should be given as preprocessor arguments using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_weights_reshape(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst)
#ifdef HAS_BIAS
    ,
    VECTOR_DECLARATION(biases)
#endif /* HAS_BIAS */
)
{
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif /* HAS_BIAS */

    __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
    __global uchar *output_ptr    = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;

    for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)
    {
        *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
    }

#if defined(HAS_BIAS)
    if(get_global_id(1) == 0)
    {
        *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
    }
#endif // defined(HAS_BIAS)
}
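
// Resulting layout: output row z holds the flattened weights plane of channel z, i.e.
//     [ w(0,0) ... w(0,SRC_WIDTH-1), w(1,0) ... w(H-1,SRC_WIDTH-1) ]
// where H = get_global_size(1) is the plane height, with the channel's bias value appended as one
// extra element at the end of the row when HAS_BIAS is defined.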
#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)

#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
{
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    const int src_pixel_linear = get_global_id(1) * STRIDE_X;
    const int full_length      = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
    const int max_initial_x    = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);

    const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
    const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
    const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;

    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));

    for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
    {
        for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
        {
            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
            {
                *output_ptr = PAD_VALUE;
            }
            else
            {
                *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
            }
        }
    }
#if defined(HAS_BIAS)
    *output_ptr = (DATA_TYPE)(1);
#endif // defined(HAS_BIAS)
}
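
// Output layout: each invocation writes one im2col patch, i.e. the KERNEL_WIDTH * KERNEL_HEIGHT
// input samples seen by one output position of channel get_global_id(2) (positions falling outside
// the source are filled with PAD_VALUE), followed by a constant 1 when HAS_BIAS is defined so that
// the bias can be folded into the subsequent vector-to-matrix multiplication.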

#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)

568#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
569
570/** This kernel performs a reshaping of the output of the depthwise generic convolution.
571 *
572 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
573 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
574 *
575 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
576 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
577 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
578 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
579 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
580 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
581 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
582 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
583 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
584 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
585 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
586 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
587 */
588__kernel void depthwise_vector_to_tensor(
589 VECTOR_DECLARATION(src),
590 TENSOR3D_DECLARATION(dst))
591{
592 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
593
594 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
595 const int id0 = get_global_id(0);
596 const int z = id0 / patch_size;
597 const int index2D = id0 - z * patch_size;
598
599 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z;
600 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
601}
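
// Index mapping example: with CONV_WIDTH = 4 and CONV_HEIGHT = 3 the patch size is 12, so source
// element 14 maps to z = 1 and index2D = 2, i.e. destination element (x = 2, y = 0) of output
// channel 1.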

#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
#if defined(CONV_STRIDE_X)
#if CONV_STRIDE_X == 1
#define convolution1x3_f16 convolution1x3_stride_1_f16
#elif CONV_STRIDE_X == 2
#define convolution1x3_f16 convolution1x3_stride_2_f16
#elif CONV_STRIDE_X == 3
#define convolution1x3_f16 convolution1x3_stride_3_f16
#else /* CONV_STRIDE_X */
#error "Stride not supported"
#endif /* CONV_STRIDE_X */

/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
    half8 temp = vload8(0, (__global half *)left_pixel);

    half4 left   = CONVERT(temp.s0123, half4);
    half4 middle = CONVERT(temp.s1234, half4);
    half4 right  = CONVERT(temp.s2345, half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
}

/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
    half8 temp0 = vload8(0, (__global half *)left_pixel);
    half  temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));

    half4 left   = CONVERT(temp0.s0246, half4);
    half4 middle = CONVERT(temp0.s1357, half4);
    half4 right  = CONVERT((half4)(temp0.s246, temp1), half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
}

/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
    half16 temp0 = vload16(0, (__global half *)left_pixel);

    half4 left   = CONVERT(temp0.s0369, half4);
    half4 middle = CONVERT(temp0.s147A, half4);
    half4 right  = CONVERT(temp0.s258B, half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
}

/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
 *
 * Convolution matrix layout:
 *
 * [ mat0, mat1, mat2 ]\n
 * [ mat3, mat4, mat5 ]\n
 * [ mat6, mat7, mat8 ]\n
 *
 * @param[in] src A pointer to source Image structure
 * @param[in] mat0 Coefficient from the convolution matrix
 * @param[in] mat1 Coefficient from the convolution matrix
 * @param[in] mat2 Coefficient from the convolution matrix
 * @param[in] mat3 Coefficient from the convolution matrix
 * @param[in] mat4 Coefficient from the convolution matrix
 * @param[in] mat5 Coefficient from the convolution matrix
 * @param[in] mat6 Coefficient from the convolution matrix
 * @param[in] mat7 Coefficient from the convolution matrix
 * @param[in] mat8 Coefficient from the convolution matrix
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution3x3_f16(
    Image     *src,
    const half mat0, const half mat1, const half mat2,
    const half mat3, const half mat4, const half mat5,
    const half mat6, const half mat7, const half mat8)
{
    half4 pixels;

    pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
    pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
    pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);

    return pixels;
}

#if defined(DEPTH_MULTIPLIER)

/** This OpenCL kernel computes the depthwise convolution 3x3
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_f16(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)

    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    uchar3 offset         = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
    half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
    half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
    half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + offset.s2));

    half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                      weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                      weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
    pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x)));
#endif //defined(HAS_BIAS)

    vstore4(pixels, 0, (__global half *)dst.ptr);
}
#endif // defined(DEPTH_MULTIPLIER)
#endif // defined(CONV_STRIDE_X)

/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
 * when both stride_x and stride_y are equal to 1
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
#endif /* defined(HAS_BIAS) */

    half4 pixels0 = 0.0f;
    half4 pixels1 = 0.0f;
    half4 pixels2 = 0.0f;
    half4 pixels3 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
    half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
    half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
    half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
    half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
    half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
    half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5

    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);

#ifdef HAS_BIAS
    pixels0 += (half4)bias;
    pixels1 += (half4)bias;
    pixels2 += (half4)bias;
    pixels3 += (half4)bias;
#endif /* defined(HAS_BIAS) */

    vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
    vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
    vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
    vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
}

/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
 * when both stride_x and stride_y are equal to 2
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
#endif /* defined(HAS_BIAS) */

    half4 pixels0 = 0.0f;
    half4 pixels1 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
    half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
    half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
    half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
    half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
    half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
    half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
    half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
    half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
    half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
    half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);

#ifdef HAS_BIAS
    pixels0 += (half4)bias;
    pixels1 += (half4)bias;
#endif /* defined(HAS_BIAS) */

    vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
    vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)