blob: 07e67f4f2c4b5426011ef2ccddecb08bc4c46e34 [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
Gian Marcoc799ed82018-02-01 16:57:48 +00002 * Copyright (c) 2017-2018 ARM Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26
Giorgio Arena9fe41442017-08-23 16:36:24 +010027#if defined(CONV_STRIDE_X)
28
Giorgio Arena93a690e2017-08-01 16:09:33 +010029#if CONV_STRIDE_X == 1
30#define convolution1x3 convolution1x3_stride_1
31#elif CONV_STRIDE_X == 2
32#define convolution1x3 convolution1x3_stride_2
33#elif CONV_STRIDE_X == 3
34#define convolution1x3 convolution1x3_stride_3
35#else /* CONV_STRIDE_X */
36#error "Stride not supported"
37#endif /* CONV_STRIDE_X */
38
39/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
40 *
41 * @param[in] left_pixel Pointer to the left pixel.
42 * @param[in] left_coeff Weight of the left pixel
43 * @param[in] middle_coeff Weight of the middle pixel
44 * @param[in] right_coeff Weight of the right pixel
45 *
46 * @return a float2 containing 2 convoluted values.
47 */
48inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
49 const float left_coeff,
50 const float middle_coeff,
51 const float right_coeff)
52{
53 float4 temp = vload4(0, (__global float *)left_pixel);
54
55 float2 left = CONVERT(temp.s01, float2);
56 float2 middle = CONVERT(temp.s12, float2);
57 float2 right = CONVERT(temp.s23, float2);
58
59 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
60}
61
62/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
63 *
64 * @param[in] left_pixel Pointer to the left pixel.
65 * @param[in] left_coeff Weight of the left pixel
66 * @param[in] middle_coeff Weight of the middle pixel
67 * @param[in] right_coeff Weight of the right pixel
68 *
69 * @return a float2 containing 2 convoluted values.
70 */
71inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
72 const float left_coeff,
73 const float middle_coeff,
74 const float right_coeff)
75{
76 float4 temp0 = vload4(0, (__global float *)left_pixel);
77 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
78
79 float2 left = CONVERT(temp0.s02, float2);
80 float2 middle = CONVERT(temp0.s13, float2);
81 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
82
83 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
84}
85
86/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
87 *
88 * @param[in] left_pixel Pointer to the left pixel.
89 * @param[in] left_coeff Weight of the left pixel
90 * @param[in] middle_coeff Weight of the middle pixel
91 * @param[in] right_coeff Weight of the right pixel
92 *
93 * @return a float2 containing 2 convoluted values.
94 */
95inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
96 const float left_coeff,
97 const float middle_coeff,
98 const float right_coeff)
99{
100 float4 temp0 = vload4(0, (__global float *)left_pixel);
101 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
102
103 float2 left = CONVERT(temp0.s03, float2);
104 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
105 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
106
107 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
108}
109
110/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
111 *
112 * Convolution matrix layout:
113 *
114 * [ mat0, mat1, mat2 ]\n
115 * [ mat3, mat4, mat5 ]\n
116 * [ mat6, mat7, mat8 ]\n
117 *
118 * @param[in] src A pointer to source Image structure
119 * @param[in] mat0 Coefficient from the convolution matrix
120 * @param[in] mat1 Coefficient from the convolution matrix
121 * @param[in] mat2 Coefficient from the convolution matrix
122 * @param[in] mat3 Coefficient from the convolution matrix
123 * @param[in] mat4 Coefficient from the convolution matrix
124 * @param[in] mat5 Coefficient from the convolution matrix
125 * @param[in] mat6 Coefficient from the convolution matrix
126 * @param[in] mat0 Coefficient from the convolution matrix
127 * @param[in] mat7 Coefficient from the convolution matrix
128 * @param[in] mat8 Coefficient from the convolution matrix
129 *
130 * @return a float2 containing 2 convoluted values.
131 */
132inline float2 convolution3x3(
133 Image *src,
134 const float mat0, const float mat1, const float mat2,
135 const float mat3, const float mat4, const float mat5,
136 const float mat6, const float mat7, const float mat8)
137{
138 float2 pixels;
139
140 pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
141 pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
142 pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
143
144 return pixels;
145}
146
Gian Marcoc799ed82018-02-01 16:57:48 +0000147/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000148 *
Gian Marcoc799ed82018-02-01 16:57:48 +0000149 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000150 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
151 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
152 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
153 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
154 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
155 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
156 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000157 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000158 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
159 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
160 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
161 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
162 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
163 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
164 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000165 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000166 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
167 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
168 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
169 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
170 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
171 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
172 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
173 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
174 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
175 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
176 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
177 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100178__kernel void depthwise_convolution_3x3(
179 TENSOR3D_DECLARATION(src),
180 TENSOR3D_DECLARATION(dst),
181 TENSOR3D_DECLARATION(weights)
182#if defined(HAS_BIAS)
183 ,
184 VECTOR_DECLARATION(biases)
185#endif //defined(HAS_BIAS)
186)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100187{
188 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
189 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
190 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100191#if defined(HAS_BIAS)
192 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
193#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100194
195 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
196 float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
197 float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
198 float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));
199
200 float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
201 weights_values1.s0, weights_values1.s1, weights_values1.s2,
202 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100203#if defined(HAS_BIAS)
204 pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
205#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100206
207 vstore2(pixels, 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100208}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100209#endif //defined(CONV_STRIDE_X)
210
Gian Marcoc799ed82018-02-01 16:57:48 +0000211#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
212 ({ \
213 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
214 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
215 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
216 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
217 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
218 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
219 })
220
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000221#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
222 ({ \
223 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
224 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
225 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
226 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
227 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
228 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
229 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
230 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
231 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
232 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
233 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
234 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
235 })
236
Gian Marcoc799ed82018-02-01 16:57:48 +0000237#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
238 ({ \
239 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
240 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
241 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
242 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
243 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
244 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
245 })
246
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000247#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
248 ({ \
249 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
250 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
251 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
252 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
253 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
254 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
255 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
256 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
257 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
258 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
259 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
260 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
261 })
262
Gian Marcoc799ed82018-02-01 16:57:48 +0000263/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
264 * stride_x and stride_y are equal to 1
265 *
266 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
267 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
268 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
269 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
270 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
271 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
272 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
273 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
274 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
275 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
276 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
277 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
278 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
279 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
280 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
281 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
282 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
283 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
284 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
285 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
286 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
287 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
288 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
289 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
290 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
291 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
292 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
293 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
294 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000295__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000296 TENSOR3D_DECLARATION(src),
297 TENSOR3D_DECLARATION(dst),
298 TENSOR3D_DECLARATION(weights)
299#if defined(HAS_BIAS)
300 ,
301 VECTOR_DECLARATION(biases)
302#endif //defined(HAS_BIAS)
303)
304{
305 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
306 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
307 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
308
309 float2 pixels0 = 0.0f;
310 float2 pixels1 = 0.0f;
311 float2 pixels2 = 0.0f;
312 float2 pixels3 = 0.0f;
313
314 __global uchar *weights_addr = (__global uchar *)weights.ptr;
315 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
316
317 // Load the weights
318 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
319 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
320 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
321
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000322 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000323 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
324 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
325 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
326 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000327 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
328 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000329
330 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
331 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
332 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
333 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
334 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
335 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
336 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
337 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
338 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
339 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
340 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
341 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
342
343#ifdef HAS_BIAS
344 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
345
346 float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));
347
348 pixels0 += (float2)bias;
349 pixels1 += (float2)bias;
350 pixels2 += (float2)bias;
351 pixels3 += (float2)bias;
352#endif /* defined(HAS_BIAS) */
353
354 vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
355 vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
356 vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
357 vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
358}
359
360/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
361 * stride_x and stride_y are equal to 2
362 *
363 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
364 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
365 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
366 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
367 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
368 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
369 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
370 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
371 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
372 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
373 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
374 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
375 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
376 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
377 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
378 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
379 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
380 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
381 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
382 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
383 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
384 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
385 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
386 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
387 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
388 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
389 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
390 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
391 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000392__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000393 TENSOR3D_DECLARATION(src),
394 TENSOR3D_DECLARATION(dst),
395 TENSOR3D_DECLARATION(weights)
396#if defined(HAS_BIAS)
397 ,
398 VECTOR_DECLARATION(biases)
399#endif //defined(HAS_BIAS)
400)
401{
402 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
403 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
404 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
405
406 float2 pixels0 = 0.0f;
407 float2 pixels1 = 0.0f;
408
409 __global uchar *weights_addr = (__global uchar *)weights.ptr;
410 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
411
412 // Load the weights
413 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
414 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
415 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
416
417 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
418 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
419 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
420 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
421 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
422 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
423 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
424 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
425 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
426 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
427 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
428
429 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
430 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
431 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
432 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
433 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
434 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
435
436#ifdef HAS_BIAS
437 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
438
439 float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));
440
441 pixels0 += (float2)bias;
442 pixels1 += (float2)bias;
443#endif /* defined(HAS_BIAS) */
444
445 vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
446 vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
447}
448
Giorgio Arena9fe41442017-08-23 16:36:24 +0100449#if defined(SRC_WIDTH) && defined(DATA_TYPE)
450/** This kernel reshapes each of the tensor's low three dimensions to single rows.
451 *
452 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
453 *
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100454 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
455 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
456 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
457 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
458 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
459 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
460 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
461 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
462 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
463 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
464 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
465 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
466 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
467 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
468 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
469 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
470 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
471 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Giorgio Arena9fe41442017-08-23 16:36:24 +0100472 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100473__kernel void depthwise_weights_reshape(
474 TENSOR3D_DECLARATION(src),
475 IMAGE_DECLARATION(dst)
476#ifdef HAS_BIAS
477 ,
478 VECTOR_DECLARATION(biases)
479#endif /* HAS_BIAS */
480)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100481{
482 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100483#ifdef HAS_BIAS
484 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
485#endif /* HAS_BIAS */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100486
487 __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
488 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
489
490 for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)
491 {
492 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
493 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100494
495#if defined(HAS_BIAS)
496 if(get_global_id(1) == 0)
497 {
498 *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x));
499 }
500#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100501}
502#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
503
Georgios Pinitasde5a1cc2018-02-02 12:52:07 +0000504#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100505/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
506 *
507 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100508 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
Giorgio Arena9fe41442017-08-23 16:36:24 +0100509 *
510 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
511 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
512 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
513 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
514 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
515 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
516 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
517 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
518 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
519 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
520 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
521 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
522 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
523 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
524 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
525 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
526 */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100527__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
528{
529 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
530
531 const int src_pixel_linear = get_global_id(1) * STRIDE_X;
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100532 const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100533 const int max_initial_x = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
534
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100535 const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
536 const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100537 const int src_z = get_global_id(2);
538
539 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
540 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
541
542 for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
543 {
544 for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
545 {
546 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
547 {
Georgios Pinitasde5a1cc2018-02-02 12:52:07 +0000548 *output_ptr = PAD_VALUE;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100549 }
550 else
551 {
552 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
553 }
554 }
555 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100556#if defined(HAS_BIAS)
557 *output_ptr = (DATA_TYPE)(1);
558#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100559}
560
Georgios Pinitasde5a1cc2018-02-02 12:52:07 +0000561#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100562
563#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
564
565/** This kernel performs a reshaping of the output of the depthwise generic convolution.
566 *
567 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
568 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
569 *
570 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
571 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
572 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
573 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
574 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
575 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
576 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
577 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
578 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
579 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
580 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
581 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
582 */
583__kernel void depthwise_vector_to_tensor(
584 VECTOR_DECLARATION(src),
585 TENSOR3D_DECLARATION(dst))
586{
587 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
588
589 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
590 const int id0 = get_global_id(0);
591 const int z = id0 / patch_size;
592 const int index2D = id0 - z * patch_size;
593
594 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z;
595 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
596}
597
598#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000599
600#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
601#if defined(CONV_STRIDE_X)
602#if CONV_STRIDE_X == 1
603#define convolution1x3_f16 convolution1x3_stride_1_f16
604#elif CONV_STRIDE_X == 2
605#define convolution1x3_f16 convolution1x3_stride_2_f16
606#elif CONV_STRIDE_X == 3
607#define convolution1x3_f16 convolution1x3_stride_3_f16
608#else /* CONV_STRIDE_X */
609#error "Stride not supported"
610#endif /* CONV_STRIDE_X */
611
612/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
613 *
614 * @param[in] left_pixel Pointer to the left pixel.
615 * @param[in] left_coeff Weight of the left pixel
616 * @param[in] middle_coeff Weight of the middle pixel
617 * @param[in] right_coeff Weight of the right pixel
618 *
619 * @return a half4 containing 4 convoluted values.
620 */
621inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
622 const half left_coeff,
623 const half middle_coeff,
624 const half right_coeff)
625{
626 half8 temp = vload8(0, (__global half *)left_pixel);
627
628 half4 left = CONVERT(temp.s0123, half4);
629 half4 middle = CONVERT(temp.s1234, half4);
630 half4 right = CONVERT(temp.s2345, half4);
631
632 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
633}
634
635/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
636 *
637 * @param[in] left_pixel Pointer to the left pixel.
638 * @param[in] left_coeff Weight of the left pixel
639 * @param[in] middle_coeff Weight of the middle pixel
640 * @param[in] right_coeff Weight of the right pixel
641 *
642 * @return a half4 containing 4 convoluted values.
643 */
644inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
645 const half left_coeff,
646 const half middle_coeff,
647 const half right_coeff)
648{
649 half8 temp0 = vload8(0, (__global half *)left_pixel);
650 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
651
652 half4 left = CONVERT(temp0.s0246, half4);
653 half4 middle = CONVERT(temp0.s1357, half4);
654 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
655
656 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
657}
658
659/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
660 *
661 * @param[in] left_pixel Pointer to the left pixel.
662 * @param[in] left_coeff Weight of the left pixel
663 * @param[in] middle_coeff Weight of the middle pixel
664 * @param[in] right_coeff Weight of the right pixel
665 *
666 * @return a half4 containing 4 convoluted values.
667 */
668inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
669 const half left_coeff,
670 const half middle_coeff,
671 const half right_coeff)
672{
673 half16 temp0 = vload16(0, (__global half *)left_pixel);
674
675 half4 left = CONVERT(temp0.s0369, half4);
676 half4 middle = CONVERT(temp0.s147A, half4);
677 half4 right = CONVERT(temp0.s258B, half4);
678
679 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
680}
681
682/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
683 *
684 * Convolution matrix layout:
685 *
686 * [ mat0, mat1, mat2 ]\n
687 * [ mat3, mat4, mat5 ]\n
688 * [ mat6, mat7, mat8 ]\n
689 *
690 * @param[in] src A pointer to source Image structure
691 * @param[in] mat0 Coefficient from the convolution matrix
692 * @param[in] mat1 Coefficient from the convolution matrix
693 * @param[in] mat2 Coefficient from the convolution matrix
694 * @param[in] mat3 Coefficient from the convolution matrix
695 * @param[in] mat4 Coefficient from the convolution matrix
696 * @param[in] mat5 Coefficient from the convolution matrix
697 * @param[in] mat6 Coefficient from the convolution matrix
698 * @param[in] mat0 Coefficient from the convolution matrix
699 * @param[in] mat7 Coefficient from the convolution matrix
700 * @param[in] mat8 Coefficient from the convolution matrix
701 *
702 * @return a half4 containing 4 convoluted values.
703 */
704inline half4 convolution3x3_f16(
705 Image *src,
706 const half mat0, const half mat1, const half mat2,
707 const half mat3, const half mat4, const half mat5,
708 const half mat6, const half mat7, const half mat8)
709{
710 half4 pixels;
711
712 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
713 pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
714 pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);
715
716 return pixels;
717}
718
719/** This OpenCL kernel computes the depthwise convolution 3x3
720 *
721 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
722 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
723 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
724 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
725 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
726 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
727 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
728 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000729 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000730 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
731 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
732 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
733 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
734 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
735 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
736 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000737 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000738 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
739 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
740 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
741 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
742 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
743 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
744 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
745 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
746 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
747 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
748 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
749 */
750__kernel void depthwise_convolution_3x3_f16(
751 TENSOR3D_DECLARATION(src),
752 TENSOR3D_DECLARATION(dst),
753 TENSOR3D_DECLARATION(weights)
754#if defined(HAS_BIAS)
755 ,
756 VECTOR_DECLARATION(biases)
757#endif //defined(HAS_BIAS)
758)
759{
760 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
761 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
762 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
763#if defined(HAS_BIAS)
764 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
765#endif //defined(HAS_BIAS)
766
767 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
768 half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
769 half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
770 half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + offset.s2));
771
772 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
773 weights_values1.s0, weights_values1.s1, weights_values1.s2,
774 weights_values2.s0, weights_values2.s1, weights_values2.s2);
775#if defined(HAS_BIAS)
776 pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x)));
777#endif //defined(HAS_BIAS)
778
779 vstore4(pixels, 0, (__global half *)dst.ptr);
780}
781#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000782
783/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
784 * when both stride_x and stride_y are equal to 1
785 *
786 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
787 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
788 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
789 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
790 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
791 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
792 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
793 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
794 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
795 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
796 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
797 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
798 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
799 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
800 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
801 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
802 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
803 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
804 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
805 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
806 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
807 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
808 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
809 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
810 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
811 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
812 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
813 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
814 */
815__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
816 TENSOR3D_DECLARATION(src),
817 TENSOR3D_DECLARATION(dst),
818 TENSOR3D_DECLARATION(weights)
819#if defined(HAS_BIAS)
820 ,
821 VECTOR_DECLARATION(biases)
822#endif //defined(HAS_BIAS)
823)
824{
825 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
826 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
827 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
828
829#ifdef HAS_BIAS
830 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
831
832 half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
833#endif /* defined(HAS_BIAS) */
834
835 half4 pixels0 = 0.0f;
836 half4 pixels1 = 0.0f;
837 half4 pixels2 = 0.0f;
838 half4 pixels3 = 0.0f;
839
840 __global uchar *weights_addr = (__global uchar *)weights.ptr;
841 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
842
843 // Load the weights
844 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
845 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
846 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
847
848 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
849 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
850 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
851 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
852 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
853 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
854 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
855
856 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
857 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
858 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
859 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
860 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
861 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
862 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
863 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
864 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
865 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
866 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
867 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
868
869#ifdef HAS_BIAS
870 pixels0 += (half4)bias;
871 pixels1 += (half4)bias;
872 pixels2 += (half4)bias;
873 pixels3 += (half4)bias;
874#endif /* defined(HAS_BIAS) */
875
876 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
877 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
878 vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
879 vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
880}
881
882/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
883 * when both stride_x and stride_y are equal to 2
884 *
885 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
886 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
887 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
888 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
889 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
890 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
891 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
892 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
893 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
894 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
895 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
896 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
897 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
898 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
899 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
900 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
901 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
902 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
903 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
904 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
905 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
906 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
907 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
908 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
909 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
910 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
911 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
912 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
913 */
914__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
915 TENSOR3D_DECLARATION(src),
916 TENSOR3D_DECLARATION(dst),
917 TENSOR3D_DECLARATION(weights)
918#if defined(HAS_BIAS)
919 ,
920 VECTOR_DECLARATION(biases)
921#endif //defined(HAS_BIAS)
922)
923{
924 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
925 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
926 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
927
928#ifdef HAS_BIAS
929 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
930
931 half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
932#endif /* defined(HAS_BIAS) */
933
934 half4 pixels0 = 0.0f;
935 half4 pixels1 = 0.0f;
936
937 __global uchar *weights_addr = (__global uchar *)weights.ptr;
938 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
939
940 // Load the weights
941 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
942 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
943 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
944
945 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
946 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
947 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
948 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
949 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
950 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
951 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
952 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
953 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
954 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
955 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
956
957 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
958 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
959 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
960 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
961 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
962 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
963
964#ifdef HAS_BIAS
965 pixels0 += (half4)bias;
966 pixels1 += (half4)bias;
967#endif /* defined(HAS_BIAS) */
968
969 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
970 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
971}
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000972#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)