blob: bfaa92be1004bdec360ef4ea7ec64da670dffdc5 [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
Gian Marcoc799ed82018-02-01 16:57:48 +00002 * Copyright (c) 2017-2018 ARM Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26
Georgios Pinitase55b40a2018-09-13 17:20:04 +010027#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arena9fe41442017-08-23 16:36:24 +010028#if defined(CONV_STRIDE_X)
29
Giorgio Arena93a690e2017-08-01 16:09:33 +010030#if CONV_STRIDE_X == 1
31#define convolution1x3 convolution1x3_stride_1
32#elif CONV_STRIDE_X == 2
33#define convolution1x3 convolution1x3_stride_2
34#elif CONV_STRIDE_X == 3
35#define convolution1x3 convolution1x3_stride_3
36#else /* CONV_STRIDE_X */
37#error "Stride not supported"
38#endif /* CONV_STRIDE_X */
39
40/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
41 *
42 * @param[in] left_pixel Pointer to the left pixel.
43 * @param[in] left_coeff Weight of the left pixel
44 * @param[in] middle_coeff Weight of the middle pixel
45 * @param[in] right_coeff Weight of the right pixel
46 *
47 * @return a float2 containing 2 convoluted values.
48 */
49inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
50 const float left_coeff,
51 const float middle_coeff,
52 const float right_coeff)
53{
54 float4 temp = vload4(0, (__global float *)left_pixel);
55
56 float2 left = CONVERT(temp.s01, float2);
57 float2 middle = CONVERT(temp.s12, float2);
58 float2 right = CONVERT(temp.s23, float2);
59
60 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
61}
62
63/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
64 *
65 * @param[in] left_pixel Pointer to the left pixel.
66 * @param[in] left_coeff Weight of the left pixel
67 * @param[in] middle_coeff Weight of the middle pixel
68 * @param[in] right_coeff Weight of the right pixel
69 *
70 * @return a float2 containing 2 convoluted values.
71 */
72inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
73 const float left_coeff,
74 const float middle_coeff,
75 const float right_coeff)
76{
77 float4 temp0 = vload4(0, (__global float *)left_pixel);
78 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
79
80 float2 left = CONVERT(temp0.s02, float2);
81 float2 middle = CONVERT(temp0.s13, float2);
82 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
83
84 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
85}
86
87/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
88 *
89 * @param[in] left_pixel Pointer to the left pixel.
90 * @param[in] left_coeff Weight of the left pixel
91 * @param[in] middle_coeff Weight of the middle pixel
92 * @param[in] right_coeff Weight of the right pixel
93 *
94 * @return a float2 containing 2 convoluted values.
95 */
96inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
97 const float left_coeff,
98 const float middle_coeff,
99 const float right_coeff)
100{
101 float4 temp0 = vload4(0, (__global float *)left_pixel);
102 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
103
104 float2 left = CONVERT(temp0.s03, float2);
105 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
106 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
107
108 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
109}
110
111/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
112 *
113 * Convolution matrix layout:
114 *
115 * [ mat0, mat1, mat2 ]\n
116 * [ mat3, mat4, mat5 ]\n
117 * [ mat6, mat7, mat8 ]\n
118 *
119 * @param[in] src A pointer to source Image structure
120 * @param[in] mat0 Coefficient from the convolution matrix
121 * @param[in] mat1 Coefficient from the convolution matrix
122 * @param[in] mat2 Coefficient from the convolution matrix
123 * @param[in] mat3 Coefficient from the convolution matrix
124 * @param[in] mat4 Coefficient from the convolution matrix
125 * @param[in] mat5 Coefficient from the convolution matrix
126 * @param[in] mat6 Coefficient from the convolution matrix
127 * @param[in] mat0 Coefficient from the convolution matrix
128 * @param[in] mat7 Coefficient from the convolution matrix
129 * @param[in] mat8 Coefficient from the convolution matrix
130 *
131 * @return a float2 containing 2 convoluted values.
132 */
133inline float2 convolution3x3(
134 Image *src,
135 const float mat0, const float mat1, const float mat2,
136 const float mat3, const float mat4, const float mat5,
137 const float mat6, const float mat7, const float mat8)
138{
139 float2 pixels;
140
141 pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
142 pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
143 pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
144
145 return pixels;
146}
147
Gian Marcoc799ed82018-02-01 16:57:48 +0000148/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000149 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000150 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
151 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000152 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000153 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000154 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000155 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000156 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
157 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000158 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000159 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
160 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
161 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
162 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
163 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
164 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
165 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000166 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000167 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
168 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
169 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
170 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
171 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
172 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
173 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
174 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
175 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
176 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
177 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
178 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100179__kernel void depthwise_convolution_3x3(
180 TENSOR3D_DECLARATION(src),
181 TENSOR3D_DECLARATION(dst),
182 TENSOR3D_DECLARATION(weights)
183#if defined(HAS_BIAS)
184 ,
185 VECTOR_DECLARATION(biases)
186#endif //defined(HAS_BIAS)
187)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100188{
189 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
190 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100191 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100192#if defined(HAS_BIAS)
193 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
194#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100195
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100196 // Extract channel and linearized batch indices
197 const int channel = get_global_id(2) % DST_CHANNELS;
198 const int batch = get_global_id(2) / DST_CHANNELS;
199 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
200 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
201 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100202
Giorgio Arena93a690e2017-08-01 16:09:33 +0100203 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100204 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + offset.s0));
205 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + offset.s1));
206 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + offset.s2));
Giorgio Arena93a690e2017-08-01 16:09:33 +0100207
208 float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
209 weights_values1.s0, weights_values1.s1, weights_values1.s2,
210 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100211#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100212 pixels += (float2)(*((__global float *)(biases.ptr + channel * biases_stride_x)));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100213#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100214
215 vstore2(pixels, 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100216}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100217#endif //defined(CONV_STRIDE_X)
218
Gian Marcoc799ed82018-02-01 16:57:48 +0000219#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
220 ({ \
221 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
222 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
223 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
224 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
225 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
226 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
227 })
228
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000229#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
230 ({ \
231 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
232 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
233 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
234 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
235 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
236 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
237 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
238 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
239 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
240 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
241 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
242 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
243 })
244
Gian Marcoc799ed82018-02-01 16:57:48 +0000245#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
246 ({ \
247 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
248 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
249 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
250 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
251 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
252 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
253 })
254
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000255#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
256 ({ \
257 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
258 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
259 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
260 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
261 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
262 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
263 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
264 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
265 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
266 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
267 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
268 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
269 })
270
Gian Marcoc799ed82018-02-01 16:57:48 +0000271/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
272 * stride_x and stride_y are equal to 1
273 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000274 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
275 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000276 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000277 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000278 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000279 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000280 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
281 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
282 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
283 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
284 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
285 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
286 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
287 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
288 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
289 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
290 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
291 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
292 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
293 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
294 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
295 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
296 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
297 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
298 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
299 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
300 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
301 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
302 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000303__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000304 TENSOR3D_DECLARATION(src),
305 TENSOR3D_DECLARATION(dst),
306 TENSOR3D_DECLARATION(weights)
307#if defined(HAS_BIAS)
308 ,
309 VECTOR_DECLARATION(biases)
310#endif //defined(HAS_BIAS)
311)
312{
313 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
314 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100315 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000316
317 float2 pixels0 = 0.0f;
318 float2 pixels1 = 0.0f;
319 float2 pixels2 = 0.0f;
320 float2 pixels3 = 0.0f;
321
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100322 // Extract channel and linearized batch indices
323 const int channel = get_global_id(2) % DST_CHANNELS;
324 const int batch = get_global_id(2) / DST_CHANNELS;
325 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
326 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
327 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000328
329 // Load the weights
330 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
331 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
332 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
333
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000334 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000335 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
336 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
337 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
338 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000339 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
340 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000341
342 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
343 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
344 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
345 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
346 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
347 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
348 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
349 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
350 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
351 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
352 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
353 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
354
355#ifdef HAS_BIAS
356 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
357
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100358 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000359
360 pixels0 += (float2)bias;
361 pixels1 += (float2)bias;
362 pixels2 += (float2)bias;
363 pixels3 += (float2)bias;
364#endif /* defined(HAS_BIAS) */
365
366 vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
367 vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
368 vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
369 vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
370}
371
372/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
373 * stride_x and stride_y are equal to 2
374 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000375 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
376 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000377 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000378 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000379 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000380 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000381 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
382 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
383 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
384 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
385 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
386 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
387 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
388 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
389 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
390 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
391 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
392 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
393 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
394 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
395 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
396 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
397 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
398 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
399 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
400 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
401 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
402 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
403 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000404__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000405 TENSOR3D_DECLARATION(src),
406 TENSOR3D_DECLARATION(dst),
407 TENSOR3D_DECLARATION(weights)
408#if defined(HAS_BIAS)
409 ,
410 VECTOR_DECLARATION(biases)
411#endif //defined(HAS_BIAS)
412)
413{
414 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
415 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100416 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000417
418 float2 pixels0 = 0.0f;
419 float2 pixels1 = 0.0f;
420
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100421 // Extract channel and linearized batch indices
Georgios Pinitas37044642018-10-30 14:53:25 +0000422 const int channel = get_global_id(2) % DST_CHANNELS;
423 const int batch = get_global_id(2) / DST_CHANNELS;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100424 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
425 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
426 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000427
428 // Load the weights
429 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
430 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
431 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
432
433 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
434 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
435 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
436 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
437 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
438 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
439 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
440 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
441 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
442 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
443 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
444
445 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
446 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
447 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
448 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
449 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
450 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
451
452#ifdef HAS_BIAS
453 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
454
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100455 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000456
457 pixels0 += (float2)bias;
458 pixels1 += (float2)bias;
459#endif /* defined(HAS_BIAS) */
460
461 vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
462 vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
463}
464
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100465#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arena76572242018-04-04 17:44:26 +0100466
Giorgio Arenad051e972018-06-20 11:46:42 +0100467#if defined(NCHW)
468#define in_stride_x src_stride_x
469#define in_stride_y src_stride_y
470#define in_stride_z src_stride_z
471#define out_stride_x dst_stride_x
472#define out_stride_y dst_stride_y
473#define out_stride_z dst_stride_z
474#else //defined(NCHW)
475#define in_stride_x src_stride_y
476#define in_stride_y src_stride_z
477#define in_stride_z src_stride_x
478#define out_stride_x dst_stride_y
479#define out_stride_y dst_stride_z
480#define out_stride_z dst_stride_x
481#endif //defined(NCHW)
482
Giorgio Arena9fe41442017-08-23 16:36:24 +0100483#if defined(SRC_WIDTH) && defined(DATA_TYPE)
484/** This kernel reshapes each of the tensor's low three dimensions to single rows.
485 *
486 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
487 *
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100488 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
489 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
490 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
491 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
492 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
493 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
494 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
495 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
496 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
497 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
498 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
499 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
500 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
501 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
502 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
503 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
504 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
505 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Giorgio Arena9fe41442017-08-23 16:36:24 +0100506 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100507__kernel void depthwise_weights_reshape(
508 TENSOR3D_DECLARATION(src),
509 IMAGE_DECLARATION(dst)
510#ifdef HAS_BIAS
511 ,
512 VECTOR_DECLARATION(biases)
513#endif /* HAS_BIAS */
514)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100515{
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100516#ifdef HAS_BIAS
517 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
518#endif /* HAS_BIAS */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100519
Giorgio Arenad051e972018-06-20 11:46:42 +0100520 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
521 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100522
Giorgio Arenad051e972018-06-20 11:46:42 +0100523 for(int i = 0; i < SRC_WIDTH; ++i, input_ptr += in_stride_x)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100524 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100525 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *((__global DATA_TYPE *)input_ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100526 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100527
528#if defined(HAS_BIAS)
529 if(get_global_id(1) == 0)
530 {
Michele Di Giorgiod24af8a2018-05-08 17:23:52 +0100531 *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100532 }
533#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100534}
535#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
536
Giorgio Arena76572242018-04-04 17:44:26 +0100537#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100538/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
539 *
540 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Giorgio Arena76572242018-04-04 17:44:26 +0100541 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
Giorgio Arena9fe41442017-08-23 16:36:24 +0100542 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100543 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100544 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
545 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
546 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
547 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
548 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
549 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
550 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
551 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
552 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
553 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
554 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
555 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
556 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
557 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
558 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
559 */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100560__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
561{
562 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
563
564 const int src_pixel_linear = get_global_id(1) * STRIDE_X;
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100565 const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100566 const int max_initial_x = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
567
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100568 const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
569 const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
Giorgio Arena76572242018-04-04 17:44:26 +0100570 const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100571
Giorgio Arenad051e972018-06-20 11:46:42 +0100572 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100573 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
574
575 for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
576 {
577 for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
578 {
579 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
580 {
Georgios Pinitasde5a1cc2018-02-02 12:52:07 +0000581 *output_ptr = PAD_VALUE;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100582 }
583 else
584 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100585 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * in_stride_x + y * in_stride_y));
Giorgio Arena9fe41442017-08-23 16:36:24 +0100586 }
587 }
588 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100589#if defined(HAS_BIAS)
590 *output_ptr = (DATA_TYPE)(1);
591#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100592}
593
Giorgio Arena76572242018-04-04 17:44:26 +0100594#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100595
596#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
597
598/** This kernel performs a reshaping of the output of the depthwise generic convolution.
599 *
600 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
601 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
602 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100603 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100604 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
605 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
606 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
607 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
608 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
609 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
610 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
611 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
612 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
613 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
614 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
615 */
616__kernel void depthwise_vector_to_tensor(
617 VECTOR_DECLARATION(src),
618 TENSOR3D_DECLARATION(dst))
619{
620 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
621
622 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
623 const int id0 = get_global_id(0);
624 const int z = id0 / patch_size;
625 const int index2D = id0 - z * patch_size;
626
Giorgio Arenad051e972018-06-20 11:46:42 +0100627 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * out_stride_x + index2D / CONV_WIDTH * out_stride_y + z * out_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100628 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
629}
630
631#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000632
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100633#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000634#if defined(CONV_STRIDE_X)
635#if CONV_STRIDE_X == 1
636#define convolution1x3_f16 convolution1x3_stride_1_f16
637#elif CONV_STRIDE_X == 2
638#define convolution1x3_f16 convolution1x3_stride_2_f16
639#elif CONV_STRIDE_X == 3
640#define convolution1x3_f16 convolution1x3_stride_3_f16
641#else /* CONV_STRIDE_X */
642#error "Stride not supported"
643#endif /* CONV_STRIDE_X */
644
645/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
646 *
647 * @param[in] left_pixel Pointer to the left pixel.
648 * @param[in] left_coeff Weight of the left pixel
649 * @param[in] middle_coeff Weight of the middle pixel
650 * @param[in] right_coeff Weight of the right pixel
651 *
652 * @return a half4 containing 4 convoluted values.
653 */
654inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
655 const half left_coeff,
656 const half middle_coeff,
657 const half right_coeff)
658{
659 half8 temp = vload8(0, (__global half *)left_pixel);
660
661 half4 left = CONVERT(temp.s0123, half4);
662 half4 middle = CONVERT(temp.s1234, half4);
663 half4 right = CONVERT(temp.s2345, half4);
664
665 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
666}
667
668/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
669 *
670 * @param[in] left_pixel Pointer to the left pixel.
671 * @param[in] left_coeff Weight of the left pixel
672 * @param[in] middle_coeff Weight of the middle pixel
673 * @param[in] right_coeff Weight of the right pixel
674 *
675 * @return a half4 containing 4 convoluted values.
676 */
677inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
678 const half left_coeff,
679 const half middle_coeff,
680 const half right_coeff)
681{
682 half8 temp0 = vload8(0, (__global half *)left_pixel);
683 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
684
685 half4 left = CONVERT(temp0.s0246, half4);
686 half4 middle = CONVERT(temp0.s1357, half4);
687 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
688
689 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
690}
691
692/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
693 *
694 * @param[in] left_pixel Pointer to the left pixel.
695 * @param[in] left_coeff Weight of the left pixel
696 * @param[in] middle_coeff Weight of the middle pixel
697 * @param[in] right_coeff Weight of the right pixel
698 *
699 * @return a half4 containing 4 convoluted values.
700 */
701inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
702 const half left_coeff,
703 const half middle_coeff,
704 const half right_coeff)
705{
706 half16 temp0 = vload16(0, (__global half *)left_pixel);
707
708 half4 left = CONVERT(temp0.s0369, half4);
709 half4 middle = CONVERT(temp0.s147A, half4);
710 half4 right = CONVERT(temp0.s258B, half4);
711
712 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
713}
714
715/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
716 *
717 * Convolution matrix layout:
718 *
719 * [ mat0, mat1, mat2 ]\n
720 * [ mat3, mat4, mat5 ]\n
721 * [ mat6, mat7, mat8 ]\n
722 *
723 * @param[in] src A pointer to source Image structure
724 * @param[in] mat0 Coefficient from the convolution matrix
725 * @param[in] mat1 Coefficient from the convolution matrix
726 * @param[in] mat2 Coefficient from the convolution matrix
727 * @param[in] mat3 Coefficient from the convolution matrix
728 * @param[in] mat4 Coefficient from the convolution matrix
729 * @param[in] mat5 Coefficient from the convolution matrix
730 * @param[in] mat6 Coefficient from the convolution matrix
731 * @param[in] mat0 Coefficient from the convolution matrix
732 * @param[in] mat7 Coefficient from the convolution matrix
733 * @param[in] mat8 Coefficient from the convolution matrix
734 *
735 * @return a half4 containing 4 convoluted values.
736 */
737inline half4 convolution3x3_f16(
738 Image *src,
739 const half mat0, const half mat1, const half mat2,
740 const half mat3, const half mat4, const half mat5,
741 const half mat6, const half mat7, const half mat8)
742{
743 half4 pixels;
744
745 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
746 pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
747 pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);
748
749 return pixels;
750}
751
Giorgio Arena76572242018-04-04 17:44:26 +0100752#if defined(DEPTH_MULTIPLIER)
753
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000754/** This OpenCL kernel computes the depthwise convolution 3x3
755 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000756 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
757 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000758 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000759 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000760 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000761 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
762 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000763 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000764 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000765 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
766 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
767 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
768 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
769 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
770 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
771 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000772 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000773 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
774 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
775 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
776 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
777 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
778 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
779 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
780 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
781 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
782 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
783 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
784 */
785__kernel void depthwise_convolution_3x3_f16(
786 TENSOR3D_DECLARATION(src),
787 TENSOR3D_DECLARATION(dst),
788 TENSOR3D_DECLARATION(weights)
789#if defined(HAS_BIAS)
790 ,
791 VECTOR_DECLARATION(biases)
792#endif //defined(HAS_BIAS)
793)
794{
795 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
796 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100797 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000798#if defined(HAS_BIAS)
799 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
800#endif //defined(HAS_BIAS)
801
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100802 // Extract channel and linearized batch indices
803 const int channel = get_global_id(2) % DST_CHANNELS;
804 const int batch = get_global_id(2) / DST_CHANNELS;
805 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
806 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
807 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100808
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000809 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100810 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
811 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
812 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000813
814 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
815 weights_values1.s0, weights_values1.s1, weights_values1.s2,
816 weights_values2.s0, weights_values2.s1, weights_values2.s2);
817#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100818 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000819#endif //defined(HAS_BIAS)
820
821 vstore4(pixels, 0, (__global half *)dst.ptr);
822}
Giorgio Arena76572242018-04-04 17:44:26 +0100823#endif // defined(DEPTH_MULTIPLIER)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000824#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000825
826/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
827 * when both stride_x and stride_y are equal to 1
828 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000829 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
830 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000831 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000832 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000833 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000834 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
835 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000836 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000837 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
838 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
839 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
840 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
841 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
842 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
843 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
844 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
845 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
846 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
847 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
848 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
849 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
850 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
851 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
852 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
853 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
854 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
855 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
856 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
857 */
858__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
859 TENSOR3D_DECLARATION(src),
860 TENSOR3D_DECLARATION(dst),
861 TENSOR3D_DECLARATION(weights)
862#if defined(HAS_BIAS)
863 ,
864 VECTOR_DECLARATION(biases)
865#endif //defined(HAS_BIAS)
866)
867{
868 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
869 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100870 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
871
872 // Extract channel and linearized batch indices
873 const int channel = get_global_id(2) % DST_CHANNELS;
874 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000875
876#ifdef HAS_BIAS
877 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
878
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100879 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000880#endif /* defined(HAS_BIAS) */
881
882 half4 pixels0 = 0.0f;
883 half4 pixels1 = 0.0f;
884 half4 pixels2 = 0.0f;
885 half4 pixels3 = 0.0f;
886
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100887 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
888 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
889 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000890
891 // Load the weights
892 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
893 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
894 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
895
896 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
897 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
898 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
899 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
900 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
901 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
902 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
903
904 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
905 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
906 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
907 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
908 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
909 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
910 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
911 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
912 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
913 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
914 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
915 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
916
917#ifdef HAS_BIAS
918 pixels0 += (half4)bias;
919 pixels1 += (half4)bias;
920 pixels2 += (half4)bias;
921 pixels3 += (half4)bias;
922#endif /* defined(HAS_BIAS) */
923
924 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
925 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
926 vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
927 vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
928}
929
930/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
931 * when both stride_x and stride_y are equal to 2
932 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000933 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
934 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000935 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000936 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000937 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000938 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000939 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
940 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000941 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
942 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
943 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
944 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
945 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
946 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
947 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
948 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
949 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
950 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
951 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
952 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
953 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
954 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
955 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
956 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
957 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
958 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
959 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
960 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
961 */
962__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
963 TENSOR3D_DECLARATION(src),
964 TENSOR3D_DECLARATION(dst),
965 TENSOR3D_DECLARATION(weights)
966#if defined(HAS_BIAS)
967 ,
968 VECTOR_DECLARATION(biases)
969#endif //defined(HAS_BIAS)
970)
971{
972 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
973 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100974 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
975
976 // Extract channel and linearized batch indices
977 const int channel = get_global_id(2) % DST_CHANNELS;
978 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000979
980#ifdef HAS_BIAS
981 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
982
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100983 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000984#endif /* defined(HAS_BIAS) */
985
986 half4 pixels0 = 0.0f;
987 half4 pixels1 = 0.0f;
988
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100989 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
990 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
991 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000992
993 // Load the weights
994 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
995 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
996 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
997
998 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
999 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1000 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1001 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1002 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1003 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1004 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1005 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1006 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1007 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1008 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1009
1010 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1011 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1012 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1013 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1014 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1015 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
1016
1017#ifdef HAS_BIAS
1018 pixels0 += (half4)bias;
1019 pixels1 += (half4)bias;
1020#endif /* defined(HAS_BIAS) */
1021
1022 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1023 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1024}
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001025#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arenad051e972018-06-20 11:46:42 +01001026
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001027#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001028
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001029#if DATA_TYPE != float || DATA_TYPE != half
1030#error "Unsupported data type"
1031#endif // DATA_TYPE != float || DATA_TYPE != half
1032
1033#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001034
1035#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1036/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1037 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001038 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001039 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1040 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1041 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1042 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1043 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1044 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
1045 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001046 * @param[in] src_ptr Pointer to the source tensor. Supported data types: FP32
1047 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001048 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001049 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001050 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001051 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001052 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1053 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1054 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1055 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001056 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1057 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1058 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1059 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1060 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1061 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1062 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001063 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1064 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001065 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1066 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
1067 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1068 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1069 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1070 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1071 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1072 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1073 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1074 * @param[in] max_offset Max offset for the input tensor
1075 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1076 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1077 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1078 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1079 */
1080__kernel void depthwise_convolution_3x3_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001081 TENSOR4D_DECLARATION(src),
1082 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001083 TENSOR3D_DECLARATION(weights),
1084#if defined(HAS_BIAS)
1085 VECTOR_DECLARATION(biases),
1086#endif /* defined(HAS_BIAS) */
1087 int max_offset)
1088{
1089 int x = get_global_id(0); // channels
1090 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001091#if defined(DST_DEPTH)
1092 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1093 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1094#else /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001095 int z = get_global_id(2); // spatial coordinate y
Georgios Pinitas37044642018-10-30 14:53:25 +00001096#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001097
1098 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1099
Georgios Pinitas37044642018-10-30 14:53:25 +00001100#if defined(DST_DEPTH)
1101 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1102#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001103 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001104#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001105
1106 int z_coord = 0;
1107 int4 offset = 0;
1108 int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
1109
1110 // We compute 2x1x1 [C,W,H] elements
1111 VEC_FLOAT acc = 0;
1112
1113 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001114 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1115 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1116 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1117 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1118 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1119 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1120 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1121 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1122 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001123
1124 // Load input values
1125 // z == 0
1126 // Clamp z_coord as for z = 0, it can be negative
1127 // z_coord is casted to unsigned int in order to use just a min() operation
1128 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
1129 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1130 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1131 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001132 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001133
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001134 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1135 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1136 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001137
1138 // z == 1
1139 // z_coord can be only negative for z = 0 so we do not need to clamp it
1140 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
1141 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
1142 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001143 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1144 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1145 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001146
1147 // z == 2
1148 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1149 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1150 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001151 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001152 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1153 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1154 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001155
1156 acc = fma(values0, w0, acc);
1157 acc = fma(values1, w1, acc);
1158 acc = fma(values2, w2, acc);
1159
1160 acc = fma(values3, w3, acc);
1161 acc = fma(values4, w4, acc);
1162 acc = fma(values5, w5, acc);
1163
1164 acc = fma(values6, w6, acc);
1165 acc = fma(values7, w7, acc);
1166 acc = fma(values8, w8, acc);
1167
1168#if defined(HAS_BIAS)
1169 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001170 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001171 acc += bias_values;
1172#endif // defined(HAS_BIAS)
1173
Georgios Pinitas37044642018-10-30 14:53:25 +00001174#if defined(DST_DEPTH)
1175 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1176#else /* defined(DST_DEPTH) */
1177 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
1178#endif /* defined(DST_DEPTH) */
1179
Giorgio Arenad051e972018-06-20 11:46:42 +01001180 VSTORE(VEC_SIZE)
Georgios Pinitas37044642018-10-30 14:53:25 +00001181 (acc, 0, (__global DATA_TYPE *)(dst_addr));
Giorgio Arenad051e972018-06-20 11:46:42 +01001182}
1183#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1184
1185#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1186/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1187 *
1188 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1189 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1190 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1191 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1192 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1193 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1194 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001195 * @param[in] src_ptr Pointer to the source tensor. Supported data types: FP32
1196 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001197 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001198 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001199 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001200 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001201 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1202 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1203 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1204 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenad051e972018-06-20 11:46:42 +01001205 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1206 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1207 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1208 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1209 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1210 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1211 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001212 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1213 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenad051e972018-06-20 11:46:42 +01001214 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1215 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
1216 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1217 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1218 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1219 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1220 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1221 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1222 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1223 * @param[in] max_offset Max offset for the input tensor
1224 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1225 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1226 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1227 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1228 */
1229__kernel void depthwise_convolution_3x3_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +00001230 TENSOR4D_DECLARATION(src),
1231 TENSOR4D_DECLARATION(dst),
Giorgio Arenad051e972018-06-20 11:46:42 +01001232 TENSOR3D_DECLARATION(weights),
1233#if defined(HAS_BIAS)
1234 VECTOR_DECLARATION(biases),
1235#endif /* defined(HAS_BIAS) */
1236 int max_offset)
1237{
1238 int x = get_global_id(0); // channels
1239 int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +00001240#if defined(DST_DEPTH)
1241 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1242 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1243#else /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001244 int z = get_global_id(2); // spatial coordinate y
Georgios Pinitas37044642018-10-30 14:53:25 +00001245#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001246
1247 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1248
Georgios Pinitas37044642018-10-30 14:53:25 +00001249#if defined(DST_DEPTH)
1250 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
1251#else /* defined(DST_DEPTH) */
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001252 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001253#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001254
1255 int z_coord = 0;
1256 int4 offset = 0;
Georgios Pinitased32f432018-07-10 17:03:11 +01001257 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001258
1259 // We compute 2x2x2 [C,W,H] elements
1260 VEC_FLOAT acc0 = 0;
1261 VEC_FLOAT acc1 = 0;
1262 VEC_FLOAT acc2 = 0;
1263 VEC_FLOAT acc3 = 0;
1264
1265 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001266 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1267 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1268 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1269 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1270 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1271 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1272 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1273 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1274 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001275
1276 // Load input values
1277 // z == 0
1278 // Clamp z_coord as for z = 0, it can be negative
1279 // z_coord is casted to unsigned int in order to use just a min() operation
1280 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitased32f432018-07-10 17:03:11 +01001281 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenad051e972018-06-20 11:46:42 +01001282 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1283 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001284 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001285
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001286 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1287 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1288 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1289 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001290
1291 // z == 1
1292 // z_coord can be only negative for z = 0 so we do not need to clamp it
1293 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001294 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenad051e972018-06-20 11:46:42 +01001295 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001296 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1297 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1298 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1299 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001300
1301 // z == 2
1302 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1303 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1304 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001305 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001306 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1307 VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1308 VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1309 VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001310
1311 // z == 3
1312 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1313 // However offset can be out-of-bound so we need to check if it is greater than max_offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001314 offset += (int4)src_stride_z;
1315 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001316 VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1317 VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1318 VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1319 VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001320
1321 acc0 = fma(values0, w0, acc0);
1322 acc0 = fma(values1, w1, acc0);
1323 acc0 = fma(values2, w2, acc0);
1324 acc1 = fma(values1, w0, acc1);
1325 acc1 = fma(values2, w1, acc1);
1326 acc1 = fma(values3, w2, acc1);
1327
1328 acc0 = fma(values4, w3, acc0);
1329 acc0 = fma(values5, w4, acc0);
1330 acc0 = fma(values6, w5, acc0);
1331 acc1 = fma(values5, w3, acc1);
1332 acc1 = fma(values6, w4, acc1);
1333 acc1 = fma(values7, w5, acc1);
1334
1335 acc0 = fma(values8, w6, acc0);
1336 acc0 = fma(values9, w7, acc0);
1337 acc0 = fma(values10, w8, acc0);
1338 acc1 = fma(values9, w6, acc1);
1339 acc1 = fma(values10, w7, acc1);
1340 acc1 = fma(values11, w8, acc1);
1341
1342 acc2 = fma(values4, w0, acc2);
1343 acc2 = fma(values5, w1, acc2);
1344 acc2 = fma(values6, w2, acc2);
1345 acc3 = fma(values5, w0, acc3);
1346 acc3 = fma(values6, w1, acc3);
1347 acc3 = fma(values7, w2, acc3);
1348
1349 acc2 = fma(values8, w3, acc2);
1350 acc2 = fma(values9, w4, acc2);
1351 acc2 = fma(values10, w5, acc2);
1352 acc3 = fma(values9, w3, acc3);
1353 acc3 = fma(values10, w4, acc3);
1354 acc3 = fma(values11, w5, acc3);
1355
1356 acc2 = fma(values12, w6, acc2);
1357 acc2 = fma(values13, w7, acc2);
1358 acc2 = fma(values14, w8, acc2);
1359 acc3 = fma(values13, w6, acc3);
1360 acc3 = fma(values14, w7, acc3);
1361 acc3 = fma(values15, w8, acc3);
1362
1363#if defined(HAS_BIAS)
1364 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1365
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001366 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001367
1368 acc0 += bias_values;
1369 acc1 += bias_values;
1370 acc2 += bias_values;
1371 acc3 += bias_values;
1372#endif // defined(HAS_BIAS)
1373
Georgios Pinitas37044642018-10-30 14:53:25 +00001374#if defined(DST_DEPTH)
1375 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1376#else /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001377 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001378#endif /* defined(DST_DEPTH) */
Giorgio Arenad051e972018-06-20 11:46:42 +01001379
1380 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001381 (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001382 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001383 (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001384
1385#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1386 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1387#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1388 {
1389 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001390 (acc2, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001391 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001392 (acc3, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001393 }
1394}
1395
1396#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001397#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)