/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "helpers.h"

#if defined(DEPTH_MULTIPLIER)
#if defined(CONV_STRIDE_X)

#if CONV_STRIDE_X == 1
#define convolution1x3 convolution1x3_stride_1
#elif CONV_STRIDE_X == 2
#define convolution1x3 convolution1x3_stride_2
#elif CONV_STRIDE_X == 3
#define convolution1x3 convolution1x3_stride_3
#else /* CONV_STRIDE_X */
#error "Stride not supported"
#endif /* CONV_STRIDE_X */

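/* Note: every kernel below relies on compile-time options passed by the host when this program
 * is built. A minimal, purely illustrative host-side sketch (the exact option set used by the
 * calling code is an assumption here, not the library's actual configuration):
 *
 *     const char *build_opts = "-DDEPTH_MULTIPLIER=1 -DCONV_STRIDE_X=1 -DHAS_BIAS";
 *     cl_int err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
 *
 * Only CONV_STRIDE_X values of 1, 2 and 3 are accepted by the dispatch above; anything else
 * triggers the #error.
 */
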
/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
 *
 * @param[in] left_pixel   Pointer to the left pixel.
 * @param[in] left_coeff   Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff  Weight of the right pixel
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
                                      const float           left_coeff,
                                      const float           middle_coeff,
                                      const float           right_coeff)
{
    float4 temp = vload4(0, (__global float *)left_pixel);

    float2 left   = CONVERT(temp.s01, float2);
    float2 middle = CONVERT(temp.s12, float2);
    float2 right  = CONVERT(temp.s23, float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
}

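/* Worked example (stride 1): with the four input values [p0 p1 p2 p3] loaded by vload4 and
 * weights [w0 w1 w2], the two returned values are
 *
 *     out.s0 = p0 * w0 + p1 * w1 + p2 * w2
 *     out.s1 = p1 * w0 + p2 * w1 + p3 * w2
 *
 * i.e. two horizontally adjacent output pixels computed from four consecutive input pixels.
 */
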
/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
 *
 * @param[in] left_pixel   Pointer to the left pixel.
 * @param[in] left_coeff   Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff  Weight of the right pixel
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
                                      const float           left_coeff,
                                      const float           middle_coeff,
                                      const float           right_coeff)
{
    float4 temp0 = vload4(0, (__global float *)left_pixel);
    float  temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));

    float2 left   = CONVERT(temp0.s02, float2);
    float2 middle = CONVERT(temp0.s13, float2);
    float2 right  = CONVERT((float2)(temp0.s2, temp1), float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
}

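/* Worked example (stride 2): five consecutive input values [p0 p1 p2 p3 p4] are needed
 * (vload4 plus one extra scalar load). With weights [w0 w1 w2]:
 *
 *     out.s0 = p0 * w0 + p1 * w1 + p2 * w2
 *     out.s1 = p2 * w0 + p3 * w1 + p4 * w2
 */
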
/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
 *
 * @param[in] left_pixel   Pointer to the left pixel.
 * @param[in] left_coeff   Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff  Weight of the right pixel
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
                                      const float           left_coeff,
                                      const float           middle_coeff,
                                      const float           right_coeff)
{
    float4 temp0 = vload4(0, (__global float *)left_pixel);
    float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));

    float2 left   = CONVERT(temp0.s03, float2);
    float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
    float2 right  = CONVERT((float2)(temp0.s2, temp1.s1), float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
}

/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
 *
 * Convolution matrix layout:
 *
 * [ mat0, mat1, mat2 ]\n
 * [ mat3, mat4, mat5 ]\n
 * [ mat6, mat7, mat8 ]\n
 *
 * @param[in] src  A pointer to source Image structure
 * @param[in] mat0 Coefficient from the convolution matrix
 * @param[in] mat1 Coefficient from the convolution matrix
 * @param[in] mat2 Coefficient from the convolution matrix
 * @param[in] mat3 Coefficient from the convolution matrix
 * @param[in] mat4 Coefficient from the convolution matrix
 * @param[in] mat5 Coefficient from the convolution matrix
 * @param[in] mat6 Coefficient from the convolution matrix
 * @param[in] mat7 Coefficient from the convolution matrix
 * @param[in] mat8 Coefficient from the convolution matrix
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution3x3(
    Image      *src,
    const float mat0, const float mat1, const float mat2,
    const float mat3, const float mat4, const float mat5,
    const float mat6, const float mat7, const float mat8)
{
    float2 pixels;

    pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
    pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
    pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);

    return pixels;
}

/** This OpenCL kernel computes the depthwise convolution 3x3
 *
 * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16/F32
 * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)

    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
    float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
    float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
    float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));

    float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                   weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                   weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
    pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
#endif //defined(HAS_BIAS)

    vstore2(pixels, 0, (__global float *)dst.ptr);
}
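
/* Launch sketch (illustrative only, not the host code actually used by the library): each
 * work-item of depthwise_convolution_3x3 writes 2 horizontally adjacent output values via
 * vstore2, so a plausible NDRange covering the output would be
 *
 *     size_t gws[3] = { dst_width / 2, dst_height, dst_depth };
 *     clEnqueueNDRangeKernel(queue, kernel, 3, NULL, gws, NULL, 0, NULL, NULL);
 *
 * with src_step_x/dst_step_x configured by the host so consecutive work-items advance by 2
 * elements along X. Border and step handling in the real host code may differ.
 */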
#endif //defined(CONV_STRIDE_X)

#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
    ({                                                  \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
    })

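/* For reference, one invocation of CONVOLUTION1x3_BIFROST2X1_STRIDE1 accumulates into a float2
 * as follows (the pure-fma form of the stride-1 1x3 convolution above):
 *
 *     acc.s0 += src0.s0 * w.s0 + src0.s1 * w.s1 + src0.s2 * w.s2;
 *     acc.s1 += src0.s1 * w.s0 + src0.s2 * w.s1 + src0.s3 * w.s2;
 */
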
#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
    ({                                                  \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
        acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
        acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
        acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
        acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
        acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
        acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
    })

#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
    ({                                                  \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
        acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
    })

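/* Stride-2 note: the second output needs one element past the first vector load, hence the
 * extra src1 argument. Expanded, CONVOLUTION1x3_BIFROST2X1_STRIDE2 computes:
 *
 *     acc.s0 += src0.s0 * w.s0 + src0.s1 * w.s1 + src0.s2 * w.s2;
 *     acc.s1 += src0.s2 * w.s0 + src0.s3 * w.s1 + src1.s0 * w.s2;
 */
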
#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
    ({                                                  \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
        acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
        acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
        acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
        acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
        acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
        acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
        acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
    })

/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
 * stride_x and stride_y are equal to 1
 *
 * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
 * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

    float2 pixels0 = 0.0f;
    float2 pixels1 = 0.0f;
    float2 pixels2 = 0.0f;
    float2 pixels3 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
    float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5

    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));

    pixels0 += (float2)bias;
    pixels1 += (float2)bias;
    pixels2 += (float2)bias;
    pixels3 += (float2)bias;
#endif /* defined(HAS_BIAS) */

    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
    vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
    vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
}

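/* Tile recap for the kernel above: every work-item produces a 4 (rows) x 2 (columns) block of
 * the output, reading a 6 x 4 block of the input. A plausible global work size (an assumption;
 * the real host configuration may differ) is therefore
 *
 *     size_t gws[3] = { dst_width / 2, dst_height / 4, dst_depth };
 */
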
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
 * stride_x and stride_y are equal to 2
 *
 * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
 * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

    float2 pixels0 = 0.0f;
    float2 pixels1 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 2x2 elements, we need to load 5 rows from the input tensor
    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
    float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4

    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));

    pixels0 += (float2)bias;
    pixels1 += (float2)bias;
#endif /* defined(HAS_BIAS) */

    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
}

#endif // defined(DEPTH_MULTIPLIER)

#if defined(NCHW)
#define in_stride_x src_stride_x
#define in_stride_y src_stride_y
#define in_stride_z src_stride_z
#define out_stride_x dst_stride_x
#define out_stride_y dst_stride_y
#define out_stride_z dst_stride_z
#else //defined(NCHW)
#define in_stride_x src_stride_y
#define in_stride_y src_stride_z
#define in_stride_z src_stride_x
#define out_stride_x dst_stride_y
#define out_stride_y dst_stride_z
#define out_stride_z dst_stride_x
#endif //defined(NCHW)

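/* The in_/out_ aliases above express the strides in logical (width, height, channel) order:
 * for NCHW they map 1:1 onto the tensor strides, while for NHWC the channel is the innermost
 * dimension, so the logical width/height/channel strides become src_stride_y / src_stride_z /
 * src_stride_x respectively (and likewise for the destination).
 */
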
#if defined(SRC_WIDTH) && defined(DATA_TYPE)
/** This kernel reshapes each plane (the two lowest dimensions) of the input tensor into a single row.
 *
 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Same as @p src_ptr
 * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 * @param[in]  biases_ptr                           (Optional) Pointer to the biases vector. Supported data types: F16/F32
 * @param[in]  biases_stride_x                      (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_weights_reshape(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst)
#ifdef HAS_BIAS
    ,
    VECTOR_DECLARATION(biases)
#endif /* HAS_BIAS */
)
{
#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif /* HAS_BIAS */

    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;

    for(int i = 0; i < SRC_WIDTH; ++i, input_ptr += in_stride_x)
    {
        *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *((__global DATA_TYPE *)input_ptr);
    }

#if defined(HAS_BIAS)
    if(get_global_id(1) == 0)
    {
        *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
    }
#endif // defined(HAS_BIAS)
}
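
/* Example of the resulting layout for a 3x3 kernel (SRC_WIDTH = 3): the row written for
 * channel z is
 *
 *     [ w00 w01 w02 w10 w11 w12 w20 w21 w22 (bias_z) ]
 *
 * where the trailing bias element is only present when HAS_BIAS is defined.
 */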
#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)

#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
{
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    const int src_pixel_linear = get_global_id(1) * STRIDE_X;
    const int full_length      = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
    const int max_initial_x    = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);

    const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
    const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
    const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;

    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));

    for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
    {
        for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
        {
            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
            {
                *output_ptr = PAD_VALUE;
            }
            else
            {
                *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * in_stride_x + y * in_stride_y));
            }
        }
    }
#if defined(HAS_BIAS)
    *output_ptr = (DATA_TYPE)(1);
#endif // defined(HAS_BIAS)
}

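/* Each patch produced by depthwise_im2col holds one KERNEL_WIDTH x KERNEL_HEIGHT window of the
 * input, laid out row by row, e.g. for a 3x3 kernel:
 *
 *     [ p(y0,x0) p(y0,x1) p(y0,x2) p(y1,x0) ... p(y2,x2) (1) ]
 *
 * Out-of-bounds positions are filled with PAD_VALUE, and a trailing 1 is appended when
 * HAS_BIAS is defined so that the dot product with the row written by depthwise_weights_reshape
 * picks up the bias term.
 */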
#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)

#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)

/** This kernel performs a reshaping of the output of the depthwise generic convolution.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depthwise_vector_to_tensor(
    VECTOR_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Vector src = CONVERT_TO_VECTOR_STRUCT(src);

    const int patch_size = CONV_WIDTH * CONV_HEIGHT;
    const int id0        = get_global_id(0);
    const int z          = id0 / patch_size;
    const int index2D    = id0 - z * patch_size;

    __global uchar *out_ptr          = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * out_stride_x + index2D / CONV_WIDTH * out_stride_y + z * out_stride_z;
    *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
}

#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
#if defined(CONV_STRIDE_X)
#if CONV_STRIDE_X == 1
#define convolution1x3_f16 convolution1x3_stride_1_f16
#elif CONV_STRIDE_X == 2
#define convolution1x3_f16 convolution1x3_stride_2_f16
#elif CONV_STRIDE_X == 3
#define convolution1x3_f16 convolution1x3_stride_3_f16
#else /* CONV_STRIDE_X */
#error "Stride not supported"
#endif /* CONV_STRIDE_X */

/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
 *
 * @param[in] left_pixel   Pointer to the left pixel.
 * @param[in] left_coeff   Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff  Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
                                         const half            left_coeff,
                                         const half            middle_coeff,
                                         const half            right_coeff)
{
    half8 temp = vload8(0, (__global half *)left_pixel);

    half4 left   = CONVERT(temp.s0123, half4);
    half4 middle = CONVERT(temp.s1234, half4);
    half4 right  = CONVERT(temp.s2345, half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
}

/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
 *
 * @param[in] left_pixel   Pointer to the left pixel.
 * @param[in] left_coeff   Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff  Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
                                         const half            left_coeff,
                                         const half            middle_coeff,
                                         const half            right_coeff)
{
    half8 temp0 = vload8(0, (__global half *)left_pixel);
    half  temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));

    half4 left   = CONVERT(temp0.s0246, half4);
    half4 middle = CONVERT(temp0.s1357, half4);
    half4 right  = CONVERT((half4)(temp0.s246, temp1), half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
}

/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
 *
 * @param[in] left_pixel   Pointer to the left pixel.
 * @param[in] left_coeff   Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff  Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
                                         const half            left_coeff,
                                         const half            middle_coeff,
                                         const half            right_coeff)
{
    half16 temp0 = vload16(0, (__global half *)left_pixel);

    half4 left   = CONVERT(temp0.s0369, half4);
    half4 middle = CONVERT(temp0.s147A, half4);
    half4 right  = CONVERT(temp0.s258B, half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
}

/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
 *
 * Convolution matrix layout:
 *
 * [ mat0, mat1, mat2 ]\n
 * [ mat3, mat4, mat5 ]\n
 * [ mat6, mat7, mat8 ]\n
 *
 * @param[in] src  A pointer to source Image structure
 * @param[in] mat0 Coefficient from the convolution matrix
 * @param[in] mat1 Coefficient from the convolution matrix
 * @param[in] mat2 Coefficient from the convolution matrix
 * @param[in] mat3 Coefficient from the convolution matrix
 * @param[in] mat4 Coefficient from the convolution matrix
 * @param[in] mat5 Coefficient from the convolution matrix
 * @param[in] mat6 Coefficient from the convolution matrix
 * @param[in] mat7 Coefficient from the convolution matrix
 * @param[in] mat8 Coefficient from the convolution matrix
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution3x3_f16(
    Image     *src,
    const half mat0, const half mat1, const half mat2,
    const half mat3, const half mat4, const half mat5,
    const half mat6, const half mat7, const half mat8)
{
    half4 pixels;

    pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
    pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
    pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);

    return pixels;
}

#if defined(DEPTH_MULTIPLIER)

/** This OpenCL kernel computes the depthwise convolution 3x3
 *
 * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
 * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16/F32
 * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_f16(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)

    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    uchar3 offset         = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
    half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
    half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
    half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + offset.s2));

    half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                      weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                      weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
    pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x)));
#endif //defined(HAS_BIAS)

    vstore4(pixels, 0, (__global half *)dst.ptr);
}
#endif // defined(DEPTH_MULTIPLIER)
#endif // defined(CONV_STRIDE_X)

/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
 * when both stride_x and stride_y are equal to 1
 *
 * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
 * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
 * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
#endif /* defined(HAS_BIAS) */

    half4 pixels0 = 0.0f;
    half4 pixels1 = 0.0f;
    half4 pixels2 = 0.0f;
    half4 pixels3 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
    half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
    half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
    half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
    half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
    half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
    half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5

    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);

#ifdef HAS_BIAS
    pixels0 += (half4)bias;
    pixels1 += (half4)bias;
    pixels2 += (half4)bias;
    pixels3 += (half4)bias;
#endif /* defined(HAS_BIAS) */

    vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
    vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
    vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
    vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
}

/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
 * when both stride_x and stride_y are equal to 2
 *
 * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
 * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
 * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 */
939__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
940 TENSOR3D_DECLARATION(src),
941 TENSOR3D_DECLARATION(dst),
942 TENSOR3D_DECLARATION(weights)
943#if defined(HAS_BIAS)
944 ,
945 VECTOR_DECLARATION(biases)
946#endif //defined(HAS_BIAS)
947)
948{
949 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
950 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
951 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
952
953#ifdef HAS_BIAS
954 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
955
956 half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
957#endif /* defined(HAS_BIAS) */
958
959 half4 pixels0 = 0.0f;
960 half4 pixels1 = 0.0f;
961
962 __global uchar *weights_addr = (__global uchar *)weights.ptr;
Giorgio Arena76572242018-04-04 17:44:26 +0100963 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000964
965 // Load the weights
966 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
967 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
968 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
969
970 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
971 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
972 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
973 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
974 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
975 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
976 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
977 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
978 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
979 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
980 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
981
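    // Each CONVOLUTION1x3_BIFROST4X1_STRIDE2 call below accumulates one 1x3 filter row over the four
    // horizontally strided outputs: pixels0 covers output row 0 (input rows 0-2) and pixels1 covers
    // output row 1 (input rows 2-4, since stride_y = 2).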
982 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
983 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
984 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
985 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
986 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
987 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
988
989#ifdef HAS_BIAS
990 pixels0 += (half4)bias;
991 pixels1 += (half4)bias;
992#endif /* defined(HAS_BIAS) */
993
994 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
995 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
996}
Giorgio Arena76572242018-04-04 17:44:26 +0100997#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
Giorgio Arenad051e972018-06-20 11:46:42 +0100998
Giorgio Arenae6bb3c62018-08-23 11:19:11 +0100999#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001000
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001001#if DATA_TYPE != float && DATA_TYPE != half
 1002#error "Unsupported data type"
 1003#endif // DATA_TYPE != float && DATA_TYPE != half
1004
1005#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
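// Illustration only: with -DDATA_TYPE=float -DVEC_SIZE=4 the macro above expands to float4,
// with -DDATA_TYPE=half -DVEC_SIZE=8 it expands to half8.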
Giorgio Arenad051e972018-06-20 11:46:42 +01001006
1007#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1008/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1009 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001010 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001011 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1012 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
1013 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1014 * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1015 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
1016 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
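 * @note For example, a possible set of build options for a F32 kernel with a vector size of 4 could be (values are illustrative only):
 *       -DDATA_TYPE=float -DVEC_SIZE=4 -DSRC_DIM_2=112 -DCONV_PAD_TOP=1 -DCONV_PAD_LEFT=1 -DCONV_STRIDE_X=2 -DCONV_STRIDE_Y=2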
1017 *
1018 * @param[in] src_ptr Pointer to the source image. Supported data types: F16/F32
1019 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1020 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1021 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1022 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1023 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1024 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1025 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1026 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1027 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1028 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1029 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1030 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1031 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1032 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1033 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1034 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1035 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1036 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1037 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1038 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1039 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1040 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
1041 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1042 * @param[in] max_offset Max offset for the input tensor
1043 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1044 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1045 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1046 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1047 */
1048__kernel void depthwise_convolution_3x3_nhwc(
1049 TENSOR3D_DECLARATION(src),
1050 TENSOR3D_DECLARATION(dst),
1051 TENSOR3D_DECLARATION(weights),
1052#if defined(HAS_BIAS)
1053 VECTOR_DECLARATION(biases),
1054#endif /* defined(HAS_BIAS) */
1055 int max_offset)
1056{
1057 int x = get_global_id(0); // channels
1058 int y = get_global_id(1); // spatial coordinate x
1059 int z = get_global_id(2); // spatial coordinate y
1060
1061 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1062
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001063 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Giorgio Arenad051e972018-06-20 11:46:42 +01001064
1065 int z_coord = 0;
1066 int4 offset = 0;
1067 int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
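    // Worked example (illustrative values only): with CONV_STRIDE_X = 2, CONV_PAD_LEFT = 1 and y = 3,
    // y_offset selects input columns (5, 6, 7, 8) in elements, each scaled by src_stride_y to a byte offset
    // (this kernel reads only the first three components, .s0-.s2).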
1068
1069 // We compute VEC_SIZE x 1 x 1 [C,W,H] elements per work-item
1070 VEC_FLOAT acc = 0;
1071
1072 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001073 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1074 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1075 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1076 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1077 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1078 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1079 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1080 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1081 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001082
1083 // Load input values
1084 // z == 0
1085 // Clamp z_coord as for z = 0, it can be negative
1086 // z_coord is cast to unsigned int in order to use just a min() operation
1087 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
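    // Example (assumed values): with CONV_PAD_TOP = 1 and z = 0, z_coord = -1 becomes 4294967295 after the cast,
    // so min() clamps it to SRC_DIM_2; the subsequent min(offset, max_offset) then keeps the load inside the tensor.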
1088 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1089 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1090 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001091 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001092
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001093 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1094 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1095 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001096
1097 // z == 1
1098 // z_coord can be only negative for z = 0 so we do not need to clamp it
1099 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
1100 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
1101 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001102 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1103 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1104 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001105
1106 // z == 2
1107 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1108 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1109 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001110 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001111 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1112 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1113 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001114
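    // Accumulate the 3x3 window: values0-2, values3-5 and values6-8 hold the three horizontal input samples
    // of the first, second and third filter row, multiplied per channel by the matching weights w0-w8.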
1115 acc = fma(values0, w0, acc);
1116 acc = fma(values1, w1, acc);
1117 acc = fma(values2, w2, acc);
1118
1119 acc = fma(values3, w3, acc);
1120 acc = fma(values4, w4, acc);
1121 acc = fma(values5, w5, acc);
1122
1123 acc = fma(values6, w6, acc);
1124 acc = fma(values7, w7, acc);
1125 acc = fma(values8, w8, acc);
1126
1127#if defined(HAS_BIAS)
1128 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001129 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001130 acc += bias_values;
1131#endif // defined(HAS_BIAS)
1132
1133 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
1134 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001135 (acc, 0, (__global DATA_TYPE *)(dst.ptr));
Giorgio Arenad051e972018-06-20 11:46:42 +01001136}
1137#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1138
1139#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1140/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1141 *
1142 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1143 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
1144 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (e.g. -DNUM_ROWS_PROCESSED=2)
1145 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (e.g. -DNUM_PLANES_PROCESSED=2)
1146 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1147 * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
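 * @note Dimension two of the output tensor is expected to be passed at compile time using -DDST_DIM_2 (e.g. -DDST_DIM_2=112); it guards the store of the second output plane when it is not a multiple of NUM_PLANES_PROCESSED
 * @note For example, a possible set of build options could be (values are illustrative only):
 *       -DDATA_TYPE=float -DVEC_SIZE=4 -DSRC_DIM_2=112 -DDST_DIM_2=112 -DCONV_PAD_TOP=1 -DCONV_PAD_LEFT=1 -DNUM_ROWS_PROCESSED=2 -DNUM_PLANES_PROCESSED=2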
1148 *
1149 * @param[in] src_ptr Pointer to the source image. Supported data types: F16/F32
1150 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1151 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1152 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1153 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1154 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1155 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1156 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1157 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1158 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1159 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1160 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1161 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1162 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1163 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
1164 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1165 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
1166 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1167 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1168 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1169 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1170 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1171 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
1172 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1173 * @param[in] max_offset Max offset for the input tensor
1174 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1175 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1176 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1177 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1178 */
1179__kernel void depthwise_convolution_3x3_nhwc_stride1(
1180 TENSOR3D_DECLARATION(src),
1181 TENSOR3D_DECLARATION(dst),
1182 TENSOR3D_DECLARATION(weights),
1183#if defined(HAS_BIAS)
1184 VECTOR_DECLARATION(biases),
1185#endif /* defined(HAS_BIAS) */
1186 int max_offset)
1187{
1188 int x = get_global_id(0); // channels
1189 int y = get_global_id(1); // spatial coordinate x
1190 int z = get_global_id(2); // spatial coordinate y
1191
1192 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1193
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001194 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Giorgio Arenad051e972018-06-20 11:46:42 +01001195
1196 int z_coord = 0;
1197 int4 offset = 0;
Georgios Pinitased32f432018-07-10 17:03:11 +01001198 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001199
1200 // We compute VEC_SIZE x 2 x 2 [C,W,H] elements per work-item
1201 VEC_FLOAT acc0 = 0;
1202 VEC_FLOAT acc1 = 0;
1203 VEC_FLOAT acc2 = 0;
1204 VEC_FLOAT acc3 = 0;
1205
1206 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001207 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1208 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1209 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1210 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1211 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1212 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1213 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1214 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1215 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001216
1217 // Load input values
1218 // z == 0
1219 // Clamp z_coord as for z = 0, it can be negative
1220 // z_coord is cast to unsigned int in order to use just a min() operation
1221 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitased32f432018-07-10 17:03:11 +01001222 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenad051e972018-06-20 11:46:42 +01001223 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1224 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001225 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001226
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001227 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1228 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1229 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1230 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001231
1232 // z == 1
1233 // z_coord can be only negative for z = 0 so we do not need to clamp it
1234 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001235 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenad051e972018-06-20 11:46:42 +01001236 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001237 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1238 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1239 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1240 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001241
1242 // z == 2
1243 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1244 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1245 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001246 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001247 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1248 VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1249 VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1250 VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001251
1252 // z == 3
1253 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1254 // However offset can be out-of-bound so we need to check if it is greater than max_offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001255 offset += (int4)src_stride_z;
1256 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001257 VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1258 VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1259 VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1260 VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001261
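    // acc0/acc1 hold two adjacent output positions along the width for the first output row (input rows 0-2);
    // acc2/acc3 hold the same two positions for the second output row (input rows 1-3), reusing the middle input rows.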
1262 acc0 = fma(values0, w0, acc0);
1263 acc0 = fma(values1, w1, acc0);
1264 acc0 = fma(values2, w2, acc0);
1265 acc1 = fma(values1, w0, acc1);
1266 acc1 = fma(values2, w1, acc1);
1267 acc1 = fma(values3, w2, acc1);
1268
1269 acc0 = fma(values4, w3, acc0);
1270 acc0 = fma(values5, w4, acc0);
1271 acc0 = fma(values6, w5, acc0);
1272 acc1 = fma(values5, w3, acc1);
1273 acc1 = fma(values6, w4, acc1);
1274 acc1 = fma(values7, w5, acc1);
1275
1276 acc0 = fma(values8, w6, acc0);
1277 acc0 = fma(values9, w7, acc0);
1278 acc0 = fma(values10, w8, acc0);
1279 acc1 = fma(values9, w6, acc1);
1280 acc1 = fma(values10, w7, acc1);
1281 acc1 = fma(values11, w8, acc1);
1282
1283 acc2 = fma(values4, w0, acc2);
1284 acc2 = fma(values5, w1, acc2);
1285 acc2 = fma(values6, w2, acc2);
1286 acc3 = fma(values5, w0, acc3);
1287 acc3 = fma(values6, w1, acc3);
1288 acc3 = fma(values7, w2, acc3);
1289
1290 acc2 = fma(values8, w3, acc2);
1291 acc2 = fma(values9, w4, acc2);
1292 acc2 = fma(values10, w5, acc2);
1293 acc3 = fma(values9, w3, acc3);
1294 acc3 = fma(values10, w4, acc3);
1295 acc3 = fma(values11, w5, acc3);
1296
1297 acc2 = fma(values12, w6, acc2);
1298 acc2 = fma(values13, w7, acc2);
1299 acc2 = fma(values14, w8, acc2);
1300 acc3 = fma(values13, w6, acc3);
1301 acc3 = fma(values14, w7, acc3);
1302 acc3 = fma(values15, w8, acc3);
1303
1304#if defined(HAS_BIAS)
1305 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1306
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001307 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001308
1309 acc0 += bias_values;
1310 acc1 += bias_values;
1311 acc2 += bias_values;
1312 acc3 += bias_values;
1313#endif // defined(HAS_BIAS)
1314
1315 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
1316
1317 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001318 (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001319 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001320 (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001321
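    // Store the second output plane only if it lies inside the destination; the guard is needed when
    // DST_DIM_2 is not a multiple of NUM_PLANES_PROCESSED.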
1322#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1323 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1324#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1325 {
1326 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001327 (acc2, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001328 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001329 (acc3, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001330 }
1331}
1332
1333#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001334#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)