blob: 97b46c47cfc235bb9da68536adefd93a3724c35c [file] [log] [blame]
Giorgio Arena93a690e2017-08-01 16:09:33 +01001/*
Gian Marcoc799ed82018-02-01 16:57:48 +00002 * Copyright (c) 2017-2018 ARM Limited.
Giorgio Arena93a690e2017-08-01 16:09:33 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers.h"
26
Georgios Pinitase55b40a2018-09-13 17:20:04 +010027#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arena9fe41442017-08-23 16:36:24 +010028#if defined(CONV_STRIDE_X)
29
Giorgio Arena93a690e2017-08-01 16:09:33 +010030#if CONV_STRIDE_X == 1
31#define convolution1x3 convolution1x3_stride_1
32#elif CONV_STRIDE_X == 2
33#define convolution1x3 convolution1x3_stride_2
34#elif CONV_STRIDE_X == 3
35#define convolution1x3 convolution1x3_stride_3
36#else /* CONV_STRIDE_X */
37#error "Stride not supported"
38#endif /* CONV_STRIDE_X */
39
40/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
41 *
42 * @param[in] left_pixel Pointer to the left pixel.
43 * @param[in] left_coeff Weight of the left pixel
44 * @param[in] middle_coeff Weight of the middle pixel
45 * @param[in] right_coeff Weight of the right pixel
46 *
47 * @return a float2 containing 2 convoluted values.
48 */
49inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
50 const float left_coeff,
51 const float middle_coeff,
52 const float right_coeff)
53{
54 float4 temp = vload4(0, (__global float *)left_pixel);
55
56 float2 left = CONVERT(temp.s01, float2);
57 float2 middle = CONVERT(temp.s12, float2);
58 float2 right = CONVERT(temp.s23, float2);
59
60 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
61}
62
63/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
64 *
65 * @param[in] left_pixel Pointer to the left pixel.
66 * @param[in] left_coeff Weight of the left pixel
67 * @param[in] middle_coeff Weight of the middle pixel
68 * @param[in] right_coeff Weight of the right pixel
69 *
70 * @return a float2 containing 2 convoluted values.
71 */
72inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
73 const float left_coeff,
74 const float middle_coeff,
75 const float right_coeff)
76{
77 float4 temp0 = vload4(0, (__global float *)left_pixel);
78 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
79
80 float2 left = CONVERT(temp0.s02, float2);
81 float2 middle = CONVERT(temp0.s13, float2);
82 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
83
84 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
85}
86
87/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
88 *
89 * @param[in] left_pixel Pointer to the left pixel.
90 * @param[in] left_coeff Weight of the left pixel
91 * @param[in] middle_coeff Weight of the middle pixel
92 * @param[in] right_coeff Weight of the right pixel
93 *
94 * @return a float2 containing 2 convoluted values.
95 */
96inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
97 const float left_coeff,
98 const float middle_coeff,
99 const float right_coeff)
100{
101 float4 temp0 = vload4(0, (__global float *)left_pixel);
102 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
103
104 float2 left = CONVERT(temp0.s03, float2);
105 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
106 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
107
108 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
109}
110
111/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
112 *
113 * Convolution matrix layout:
114 *
115 * [ mat0, mat1, mat2 ]\n
116 * [ mat3, mat4, mat5 ]\n
117 * [ mat6, mat7, mat8 ]\n
118 *
119 * @param[in] src A pointer to source Image structure
120 * @param[in] mat0 Coefficient from the convolution matrix
121 * @param[in] mat1 Coefficient from the convolution matrix
122 * @param[in] mat2 Coefficient from the convolution matrix
123 * @param[in] mat3 Coefficient from the convolution matrix
124 * @param[in] mat4 Coefficient from the convolution matrix
125 * @param[in] mat5 Coefficient from the convolution matrix
126 * @param[in] mat6 Coefficient from the convolution matrix
127 * @param[in] mat0 Coefficient from the convolution matrix
128 * @param[in] mat7 Coefficient from the convolution matrix
129 * @param[in] mat8 Coefficient from the convolution matrix
130 *
131 * @return a float2 containing 2 convoluted values.
132 */
133inline float2 convolution3x3(
134 Image *src,
135 const float mat0, const float mat1, const float mat2,
136 const float mat3, const float mat4, const float mat5,
137 const float mat6, const float mat7, const float mat8)
138{
139 float2 pixels;
140
141 pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
142 pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
143 pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
144
145 return pixels;
146}
147
Gian Marcoc799ed82018-02-01 16:57:48 +0000148/** This OpenCL kernel computes the depthwise convolution 3x3
Anthony Barbierf202e502017-11-23 18:02:04 +0000149 *
Gian Marcoc799ed82018-02-01 16:57:48 +0000150 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000151 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
152 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
153 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
154 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
155 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
156 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
157 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Gian Marcoc799ed82018-02-01 16:57:48 +0000158 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000159 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
160 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
161 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
162 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
163 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
164 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
165 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000166 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
Anthony Barbierf202e502017-11-23 18:02:04 +0000167 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
168 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
169 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
170 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
171 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
172 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
173 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
174 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
175 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
176 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
177 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
178 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100179__kernel void depthwise_convolution_3x3(
180 TENSOR3D_DECLARATION(src),
181 TENSOR3D_DECLARATION(dst),
182 TENSOR3D_DECLARATION(weights)
183#if defined(HAS_BIAS)
184 ,
185 VECTOR_DECLARATION(biases)
186#endif //defined(HAS_BIAS)
187)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100188{
189 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
190 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100191 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100192#if defined(HAS_BIAS)
193 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
194#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100195
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100196 // Extract channel and linearized batch indices
197 const int channel = get_global_id(2) % DST_CHANNELS;
198 const int batch = get_global_id(2) / DST_CHANNELS;
199 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
200 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
201 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100202
Giorgio Arena93a690e2017-08-01 16:09:33 +0100203 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100204 float3 weights_values0 = vload3(0, (__global float *)(weights_addr + offset.s0));
205 float3 weights_values1 = vload3(0, (__global float *)(weights_addr + offset.s1));
206 float3 weights_values2 = vload3(0, (__global float *)(weights_addr + offset.s2));
Giorgio Arena93a690e2017-08-01 16:09:33 +0100207
208 float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
209 weights_values1.s0, weights_values1.s1, weights_values1.s2,
210 weights_values2.s0, weights_values2.s1, weights_values2.s2);
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100211#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100212 pixels += (float2)(*((__global float *)(biases.ptr + channel * biases_stride_x)));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100213#endif //defined(HAS_BIAS)
Giorgio Arena93a690e2017-08-01 16:09:33 +0100214
215 vstore2(pixels, 0, (__global float *)dst.ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100216}
Giorgio Arena9fe41442017-08-23 16:36:24 +0100217#endif //defined(CONV_STRIDE_X)
218
Gian Marcoc799ed82018-02-01 16:57:48 +0000219#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
220 ({ \
221 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
222 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
223 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
224 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
225 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
226 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
227 })
228
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000229#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
230 ({ \
231 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
232 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
233 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
234 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
235 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
236 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
237 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
238 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
239 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
240 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
241 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
242 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
243 })
244
Gian Marcoc799ed82018-02-01 16:57:48 +0000245#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
246 ({ \
247 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
248 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
249 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
250 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
251 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
252 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
253 })
254
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000255#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
256 ({ \
257 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
258 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
259 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
260 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
261 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
262 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
263 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
264 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
265 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
266 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
267 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
268 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
269 })
270
Gian Marcoc799ed82018-02-01 16:57:48 +0000271/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
272 * stride_x and stride_y are equal to 1
273 *
274 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
275 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
276 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
277 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
278 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
279 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
280 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
281 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
282 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
283 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
284 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
285 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
286 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
287 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
288 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
289 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
290 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
291 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
292 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
293 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
294 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
295 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
296 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
297 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
298 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
299 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
300 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
301 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
302 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000303__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000304 TENSOR3D_DECLARATION(src),
305 TENSOR3D_DECLARATION(dst),
306 TENSOR3D_DECLARATION(weights)
307#if defined(HAS_BIAS)
308 ,
309 VECTOR_DECLARATION(biases)
310#endif //defined(HAS_BIAS)
311)
312{
313 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
314 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100315 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000316
317 float2 pixels0 = 0.0f;
318 float2 pixels1 = 0.0f;
319 float2 pixels2 = 0.0f;
320 float2 pixels3 = 0.0f;
321
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100322 // Extract channel and linearized batch indices
323 const int channel = get_global_id(2) % DST_CHANNELS;
324 const int batch = get_global_id(2) / DST_CHANNELS;
325 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
326 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
327 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000328
329 // Load the weights
330 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
331 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
332 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
333
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000334 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
Gian Marcoc799ed82018-02-01 16:57:48 +0000335 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
336 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
337 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
338 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000339 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
340 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
Gian Marcoc799ed82018-02-01 16:57:48 +0000341
342 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
343 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
344 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
345 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
346 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
347 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
348 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
349 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
350 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
351 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
352 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
353 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
354
355#ifdef HAS_BIAS
356 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
357
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100358 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000359
360 pixels0 += (float2)bias;
361 pixels1 += (float2)bias;
362 pixels2 += (float2)bias;
363 pixels3 += (float2)bias;
364#endif /* defined(HAS_BIAS) */
365
366 vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
367 vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
368 vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
369 vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
370}
371
372/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
373 * stride_x and stride_y are equal to 2
374 *
375 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
376 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
377 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
378 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
379 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
380 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
381 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
382 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
383 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
384 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
385 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
386 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
387 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
388 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
389 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
390 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
391 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
392 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
393 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
394 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
395 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
396 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
397 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
398 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
399 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
400 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
401 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
402 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
403 */
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000404__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
Gian Marcoc799ed82018-02-01 16:57:48 +0000405 TENSOR3D_DECLARATION(src),
406 TENSOR3D_DECLARATION(dst),
407 TENSOR3D_DECLARATION(weights)
408#if defined(HAS_BIAS)
409 ,
410 VECTOR_DECLARATION(biases)
411#endif //defined(HAS_BIAS)
412)
413{
414 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
415 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100416 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Gian Marcoc799ed82018-02-01 16:57:48 +0000417
418 float2 pixels0 = 0.0f;
419 float2 pixels1 = 0.0f;
420
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100421 // Extract channel and linearized batch indices
422 const int channel = get_global_id(2) % DST_CHANNELS;
423 const int batch = get_global_id(2) / DST_CHANNELS;
424 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
425 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
426 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Gian Marcoc799ed82018-02-01 16:57:48 +0000427
428 // Load the weights
429 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
430 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
431 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
432
433 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
434 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
435 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
436 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
437 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
438 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
439 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
440 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
441 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
442 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
443 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
444
445 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
446 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
447 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
448 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
449 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
450 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
451
452#ifdef HAS_BIAS
453 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
454
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100455 float bias = *((__global float *)(vector_offset(&biases, channel)));
Gian Marcoc799ed82018-02-01 16:57:48 +0000456
457 pixels0 += (float2)bias;
458 pixels1 += (float2)bias;
459#endif /* defined(HAS_BIAS) */
460
461 vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
462 vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
463}
464
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100465#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arena76572242018-04-04 17:44:26 +0100466
Giorgio Arenad051e972018-06-20 11:46:42 +0100467#if defined(NCHW)
468#define in_stride_x src_stride_x
469#define in_stride_y src_stride_y
470#define in_stride_z src_stride_z
471#define out_stride_x dst_stride_x
472#define out_stride_y dst_stride_y
473#define out_stride_z dst_stride_z
474#else //defined(NCHW)
475#define in_stride_x src_stride_y
476#define in_stride_y src_stride_z
477#define in_stride_z src_stride_x
478#define out_stride_x dst_stride_y
479#define out_stride_y dst_stride_z
480#define out_stride_z dst_stride_x
481#endif //defined(NCHW)
482
Giorgio Arena9fe41442017-08-23 16:36:24 +0100483#if defined(SRC_WIDTH) && defined(DATA_TYPE)
484/** This kernel reshapes each of the tensor's low three dimensions to single rows.
485 *
486 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
487 *
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100488 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
489 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
490 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
491 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
492 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
493 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
494 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
495 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
496 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
497 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
498 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
499 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
500 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
501 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
502 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
503 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
504 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
505 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Giorgio Arena9fe41442017-08-23 16:36:24 +0100506 */
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100507__kernel void depthwise_weights_reshape(
508 TENSOR3D_DECLARATION(src),
509 IMAGE_DECLARATION(dst)
510#ifdef HAS_BIAS
511 ,
512 VECTOR_DECLARATION(biases)
513#endif /* HAS_BIAS */
514)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100515{
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100516#ifdef HAS_BIAS
517 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
518#endif /* HAS_BIAS */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100519
Giorgio Arenad051e972018-06-20 11:46:42 +0100520 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
521 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100522
Giorgio Arenad051e972018-06-20 11:46:42 +0100523 for(int i = 0; i < SRC_WIDTH; ++i, input_ptr += in_stride_x)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100524 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100525 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *((__global DATA_TYPE *)input_ptr);
Giorgio Arena9fe41442017-08-23 16:36:24 +0100526 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100527
528#if defined(HAS_BIAS)
529 if(get_global_id(1) == 0)
530 {
Michele Di Giorgiod24af8a2018-05-08 17:23:52 +0100531 *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100532 }
533#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100534}
535#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
536
Giorgio Arena76572242018-04-04 17:44:26 +0100537#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100538/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
539 *
540 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
Giorgio Arena76572242018-04-04 17:44:26 +0100541 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
Giorgio Arena9fe41442017-08-23 16:36:24 +0100542 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100543 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100544 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
545 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
546 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
547 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
548 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
549 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
550 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
551 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
552 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
553 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
554 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
555 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
556 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
557 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
558 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
559 */
Giorgio Arena9fe41442017-08-23 16:36:24 +0100560__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
561{
562 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
563
564 const int src_pixel_linear = get_global_id(1) * STRIDE_X;
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100565 const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100566 const int max_initial_x = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
567
Jaroslaw Rzepeckia1ed41f2017-10-13 11:13:58 +0100568 const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
569 const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
Giorgio Arena76572242018-04-04 17:44:26 +0100570 const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100571
Giorgio Arenad051e972018-06-20 11:46:42 +0100572 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100573 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
574
575 for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
576 {
577 for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
578 {
579 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
580 {
Georgios Pinitasde5a1cc2018-02-02 12:52:07 +0000581 *output_ptr = PAD_VALUE;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100582 }
583 else
584 {
Giorgio Arenad051e972018-06-20 11:46:42 +0100585 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * in_stride_x + y * in_stride_y));
Giorgio Arena9fe41442017-08-23 16:36:24 +0100586 }
587 }
588 }
Georgios Pinitas81a26ad2017-10-23 20:29:30 +0100589#if defined(HAS_BIAS)
590 *output_ptr = (DATA_TYPE)(1);
591#endif // defined(HAS_BIAS)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100592}
593
Giorgio Arena76572242018-04-04 17:44:26 +0100594#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
Giorgio Arena9fe41442017-08-23 16:36:24 +0100595
596#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
597
598/** This kernel performs a reshaping of the output of the depthwise generic convolution.
599 *
600 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
601 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
602 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100603 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
Giorgio Arena9fe41442017-08-23 16:36:24 +0100604 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
605 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
606 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
607 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
608 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
609 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
610 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
611 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
612 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
613 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
614 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
615 */
616__kernel void depthwise_vector_to_tensor(
617 VECTOR_DECLARATION(src),
618 TENSOR3D_DECLARATION(dst))
619{
620 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
621
622 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
623 const int id0 = get_global_id(0);
624 const int z = id0 / patch_size;
625 const int index2D = id0 - z * patch_size;
626
Giorgio Arenad051e972018-06-20 11:46:42 +0100627 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * out_stride_x + index2D / CONV_WIDTH * out_stride_y + z * out_stride_z;
Giorgio Arena9fe41442017-08-23 16:36:24 +0100628 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
629}
630
631#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000632
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100633#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000634#if defined(CONV_STRIDE_X)
635#if CONV_STRIDE_X == 1
636#define convolution1x3_f16 convolution1x3_stride_1_f16
637#elif CONV_STRIDE_X == 2
638#define convolution1x3_f16 convolution1x3_stride_2_f16
639#elif CONV_STRIDE_X == 3
640#define convolution1x3_f16 convolution1x3_stride_3_f16
641#else /* CONV_STRIDE_X */
642#error "Stride not supported"
643#endif /* CONV_STRIDE_X */
644
645/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
646 *
647 * @param[in] left_pixel Pointer to the left pixel.
648 * @param[in] left_coeff Weight of the left pixel
649 * @param[in] middle_coeff Weight of the middle pixel
650 * @param[in] right_coeff Weight of the right pixel
651 *
652 * @return a half4 containing 4 convoluted values.
653 */
654inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
655 const half left_coeff,
656 const half middle_coeff,
657 const half right_coeff)
658{
659 half8 temp = vload8(0, (__global half *)left_pixel);
660
661 half4 left = CONVERT(temp.s0123, half4);
662 half4 middle = CONVERT(temp.s1234, half4);
663 half4 right = CONVERT(temp.s2345, half4);
664
665 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
666}
667
668/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
669 *
670 * @param[in] left_pixel Pointer to the left pixel.
671 * @param[in] left_coeff Weight of the left pixel
672 * @param[in] middle_coeff Weight of the middle pixel
673 * @param[in] right_coeff Weight of the right pixel
674 *
675 * @return a half4 containing 4 convoluted values.
676 */
677inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
678 const half left_coeff,
679 const half middle_coeff,
680 const half right_coeff)
681{
682 half8 temp0 = vload8(0, (__global half *)left_pixel);
683 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
684
685 half4 left = CONVERT(temp0.s0246, half4);
686 half4 middle = CONVERT(temp0.s1357, half4);
687 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
688
689 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
690}
691
692/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
693 *
694 * @param[in] left_pixel Pointer to the left pixel.
695 * @param[in] left_coeff Weight of the left pixel
696 * @param[in] middle_coeff Weight of the middle pixel
697 * @param[in] right_coeff Weight of the right pixel
698 *
699 * @return a half4 containing 4 convoluted values.
700 */
701inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
702 const half left_coeff,
703 const half middle_coeff,
704 const half right_coeff)
705{
706 half16 temp0 = vload16(0, (__global half *)left_pixel);
707
708 half4 left = CONVERT(temp0.s0369, half4);
709 half4 middle = CONVERT(temp0.s147A, half4);
710 half4 right = CONVERT(temp0.s258B, half4);
711
712 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
713}
714
715/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
716 *
717 * Convolution matrix layout:
718 *
719 * [ mat0, mat1, mat2 ]\n
720 * [ mat3, mat4, mat5 ]\n
721 * [ mat6, mat7, mat8 ]\n
722 *
723 * @param[in] src A pointer to source Image structure
724 * @param[in] mat0 Coefficient from the convolution matrix
725 * @param[in] mat1 Coefficient from the convolution matrix
726 * @param[in] mat2 Coefficient from the convolution matrix
727 * @param[in] mat3 Coefficient from the convolution matrix
728 * @param[in] mat4 Coefficient from the convolution matrix
729 * @param[in] mat5 Coefficient from the convolution matrix
730 * @param[in] mat6 Coefficient from the convolution matrix
731 * @param[in] mat0 Coefficient from the convolution matrix
732 * @param[in] mat7 Coefficient from the convolution matrix
733 * @param[in] mat8 Coefficient from the convolution matrix
734 *
735 * @return a half4 containing 4 convoluted values.
736 */
737inline half4 convolution3x3_f16(
738 Image *src,
739 const half mat0, const half mat1, const half mat2,
740 const half mat3, const half mat4, const half mat5,
741 const half mat6, const half mat7, const half mat8)
742{
743 half4 pixels;
744
745 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
746 pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
747 pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);
748
749 return pixels;
750}
751
Giorgio Arena76572242018-04-04 17:44:26 +0100752#if defined(DEPTH_MULTIPLIER)
753
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000754/** This OpenCL kernel computes the depthwise convolution 3x3
755 *
756 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
757 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
758 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
759 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
760 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
761 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
762 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
763 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000764 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000765 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
766 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
767 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
768 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
769 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
770 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
771 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000772 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000773 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
774 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
775 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
776 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
777 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
778 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
779 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
780 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
781 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
782 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
783 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
784 */
785__kernel void depthwise_convolution_3x3_f16(
786 TENSOR3D_DECLARATION(src),
787 TENSOR3D_DECLARATION(dst),
788 TENSOR3D_DECLARATION(weights)
789#if defined(HAS_BIAS)
790 ,
791 VECTOR_DECLARATION(biases)
792#endif //defined(HAS_BIAS)
793)
794{
795 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
796 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100797 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000798#if defined(HAS_BIAS)
799 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
800#endif //defined(HAS_BIAS)
801
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100802 // Extract channel and linearized batch indices
803 const int channel = get_global_id(2) % DST_CHANNELS;
804 const int batch = get_global_id(2) / DST_CHANNELS;
805 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
806 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
807 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100808
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000809 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100810 half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
811 half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
812 half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000813
814 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
815 weights_values1.s0, weights_values1.s1, weights_values1.s2,
816 weights_values2.s0, weights_values2.s1, weights_values2.s2);
817#if defined(HAS_BIAS)
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100818 pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000819#endif //defined(HAS_BIAS)
820
821 vstore4(pixels, 0, (__global half *)dst.ptr);
822}
Giorgio Arena76572242018-04-04 17:44:26 +0100823#endif // defined(DEPTH_MULTIPLIER)
Michele Di Giorgio933fe862018-02-19 15:42:12 +0000824#endif // defined(CONV_STRIDE_X)
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000825
826/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
827 * when both stride_x and stride_y are equal to 1
828 *
829 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
830 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
831 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
832 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
833 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
834 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
835 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
836 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
837 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
838 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
839 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
840 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
841 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
842 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
843 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
844 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
845 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
846 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
847 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
848 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
849 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
850 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
851 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
852 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
853 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
854 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
855 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
856 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
857 */
858__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
859 TENSOR3D_DECLARATION(src),
860 TENSOR3D_DECLARATION(dst),
861 TENSOR3D_DECLARATION(weights)
862#if defined(HAS_BIAS)
863 ,
864 VECTOR_DECLARATION(biases)
865#endif //defined(HAS_BIAS)
866)
867{
868 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
869 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100870 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
871
872 // Extract channel and linearized batch indices
873 const int channel = get_global_id(2) % DST_CHANNELS;
874 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000875
876#ifdef HAS_BIAS
877 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
878
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100879 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000880#endif /* defined(HAS_BIAS) */
881
882 half4 pixels0 = 0.0f;
883 half4 pixels1 = 0.0f;
884 half4 pixels2 = 0.0f;
885 half4 pixels3 = 0.0f;
886
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100887 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
888 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
889 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000890
891 // Load the weights
892 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
893 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
894 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
895
896 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
897 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
898 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
899 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
900 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
901 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
902 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
903
904 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
905 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
906 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
907 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
908 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
909 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
910 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
911 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
912 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
913 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
914 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
915 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
916
917#ifdef HAS_BIAS
918 pixels0 += (half4)bias;
919 pixels1 += (half4)bias;
920 pixels2 += (half4)bias;
921 pixels3 += (half4)bias;
922#endif /* defined(HAS_BIAS) */
923
924 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
925 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
926 vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
927 vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
928}
929
930/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
931 * when both stride_x and stride_y are equal to 2
932 *
933 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
934 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
935 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
936 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
937 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
938 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
939 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
940 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
941 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
942 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
943 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
944 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
945 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
946 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
947 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
948 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
949 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
950 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
951 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
952 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
953 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
954 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
955 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
956 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
957 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
958 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
959 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
960 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
961 */
962__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
963 TENSOR3D_DECLARATION(src),
964 TENSOR3D_DECLARATION(dst),
965 TENSOR3D_DECLARATION(weights)
966#if defined(HAS_BIAS)
967 ,
968 VECTOR_DECLARATION(biases)
969#endif //defined(HAS_BIAS)
970)
971{
972 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
973 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100974 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
975
976 // Extract channel and linearized batch indices
977 const int channel = get_global_id(2) % DST_CHANNELS;
978 const int batch = get_global_id(2) / DST_CHANNELS;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000979
980#ifdef HAS_BIAS
981 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
982
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100983 half bias = *((__global half *)(vector_offset(&biases, channel)));
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000984#endif /* defined(HAS_BIAS) */
985
986 half4 pixels0 = 0.0f;
987 half4 pixels1 = 0.0f;
988
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100989 // Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
990 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
991 __global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
Michele Di Giorgio3ebef322018-02-21 10:02:58 +0000992
993 // Load the weights
994 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
995 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
996 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
997
998 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
999 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1000 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
1001 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1002 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
1003 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1004 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
1005 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1006 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
1007 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1008 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
1009
1010 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
1011 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
1012 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
1013 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
1014 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
1015 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
1016
1017#ifdef HAS_BIAS
1018 pixels0 += (half4)bias;
1019 pixels1 += (half4)bias;
1020#endif /* defined(HAS_BIAS) */
1021
1022 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
1023 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
1024}
Georgios Pinitase55b40a2018-09-13 17:20:04 +01001025#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arenad051e972018-06-20 11:46:42 +01001026
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001027#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001028
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001029#if DATA_TYPE != float || DATA_TYPE != half
1030#error "Unsupported data type"
1031#endif // DATA_TYPE != float || DATA_TYPE != half
1032
1033#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arenad051e972018-06-20 11:46:42 +01001034
1035#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1036/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
1037 *
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001038 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
Giorgio Arenad051e972018-06-20 11:46:42 +01001039 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1040 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1041 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1042 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1043 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1044 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
1045 *
1046 * @param[in] src_ptr Pointer to the source image. Supported data types: FP32
1047 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1048 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1049 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1050 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1051 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1052 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1053 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1054 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1055 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1056 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1057 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1058 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1059 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1060 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1061 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1062 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
1063 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1064 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1065 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1066 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1067 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1068 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1069 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1070 * @param[in] max_offset Max offset for the input tensor
1071 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1072 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1073 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1074 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1075 */
1076__kernel void depthwise_convolution_3x3_nhwc(
1077 TENSOR3D_DECLARATION(src),
1078 TENSOR3D_DECLARATION(dst),
1079 TENSOR3D_DECLARATION(weights),
1080#if defined(HAS_BIAS)
1081 VECTOR_DECLARATION(biases),
1082#endif /* defined(HAS_BIAS) */
1083 int max_offset)
1084{
1085 int x = get_global_id(0); // channels
1086 int y = get_global_id(1); // spatial coordinate x
1087 int z = get_global_id(2); // spatial coordinate y
1088
1089 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1090
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001091 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Giorgio Arenad051e972018-06-20 11:46:42 +01001092
1093 int z_coord = 0;
1094 int4 offset = 0;
1095 int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - CONV_PAD_LEFT) * (int4)src_stride_y;
1096
1097 // We compute 2x1x1 [C,W,H] elements
1098 VEC_FLOAT acc = 0;
1099
1100 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001101 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1102 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1103 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1104 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1105 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1106 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1107 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1108 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1109 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001110
1111 // Load input values
1112 // z == 0
1113 // Clamp z_coord as for z = 0, it can be negative
1114 // z_coord is casted to unsigned int in order to use just a min() operation
1115 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
1116 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1117 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1118 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001119 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001120
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001121 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1122 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1123 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001124
1125 // z == 1
1126 // z_coord can be only negative for z = 0 so we do not need to clamp it
1127 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
1128 z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
1129 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001130 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1131 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1132 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001133
1134 // z == 2
1135 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1136 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1137 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001138 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001139 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1140 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1141 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
Giorgio Arenad051e972018-06-20 11:46:42 +01001142
1143 acc = fma(values0, w0, acc);
1144 acc = fma(values1, w1, acc);
1145 acc = fma(values2, w2, acc);
1146
1147 acc = fma(values3, w3, acc);
1148 acc = fma(values4, w4, acc);
1149 acc = fma(values5, w5, acc);
1150
1151 acc = fma(values6, w6, acc);
1152 acc = fma(values7, w7, acc);
1153 acc = fma(values8, w8, acc);
1154
1155#if defined(HAS_BIAS)
1156 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001157 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001158 acc += bias_values;
1159#endif // defined(HAS_BIAS)
1160
1161 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
1162 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001163 (acc, 0, (__global DATA_TYPE *)(dst.ptr));
Giorgio Arenad051e972018-06-20 11:46:42 +01001164}
1165#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
1166
1167#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
1168/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.
1169 *
1170 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1171 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1172 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1173 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1174 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1175 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1176 *
1177 * @param[in] src_ptr Pointer to the source image. Supported data types: FP32
1178 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
1179 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1180 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
1181 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1182 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
1183 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1184 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
1185 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1186 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1187 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1188 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1189 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1190 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1191 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1192 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1193 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
1194 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1195 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1196 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1197 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1198 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1199 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1200 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1201 * @param[in] max_offset Max offset for the input tensor
1202 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1203 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1204 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1205 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1206 */
1207__kernel void depthwise_convolution_3x3_nhwc_stride1(
1208 TENSOR3D_DECLARATION(src),
1209 TENSOR3D_DECLARATION(dst),
1210 TENSOR3D_DECLARATION(weights),
1211#if defined(HAS_BIAS)
1212 VECTOR_DECLARATION(biases),
1213#endif /* defined(HAS_BIAS) */
1214 int max_offset)
1215{
1216 int x = get_global_id(0); // channels
1217 int y = get_global_id(1); // spatial coordinate x
1218 int z = get_global_id(2); // spatial coordinate y
1219
1220 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
1221
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001222 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
Giorgio Arenad051e972018-06-20 11:46:42 +01001223
1224 int z_coord = 0;
1225 int4 offset = 0;
Georgios Pinitased32f432018-07-10 17:03:11 +01001226 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenad051e972018-06-20 11:46:42 +01001227
1228 // We compute 2x2x2 [C,W,H] elements
1229 VEC_FLOAT acc0 = 0;
1230 VEC_FLOAT acc1 = 0;
1231 VEC_FLOAT acc2 = 0;
1232 VEC_FLOAT acc3 = 0;
1233
1234 // Load weights
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001235 VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
1236 VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
1237 VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
1238 VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
1239 VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
1240 VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
1241 VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
1242 VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
1243 VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001244
1245 // Load input values
1246 // z == 0
1247 // Clamp z_coord as for z = 0, it can be negative
1248 // z_coord is casted to unsigned int in order to use just a min() operation
1249 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitased32f432018-07-10 17:03:11 +01001250 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenad051e972018-06-20 11:46:42 +01001251 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1252 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitased32f432018-07-10 17:03:11 +01001253 offset = min(offset, (int4)max_offset);
Giorgio Arenad051e972018-06-20 11:46:42 +01001254
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001255 VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1256 VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1257 VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1258 VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001259
1260 // z == 1
1261 // z_coord can be only negative for z = 0 so we do not need to clamp it
1262 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001263 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenad051e972018-06-20 11:46:42 +01001264 offset = y_offset + (int4)(z_coord * src_stride_z);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001265 VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1266 VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1267 VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1268 VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001269
1270 // z == 2
1271 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1272 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1273 offset += (int4)src_stride_z;
Georgios Pinitased32f432018-07-10 17:03:11 +01001274 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001275 VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1276 VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1277 VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1278 VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001279
1280 // z == 3
1281 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1282 // However offset can be out-of-bound so we need to check if it is greater than max_offset
Georgios Pinitased32f432018-07-10 17:03:11 +01001283 offset += (int4)src_stride_z;
1284 offset = min(offset, (int4)max_offset);
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001285 VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
1286 VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
1287 VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
1288 VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
Giorgio Arenad051e972018-06-20 11:46:42 +01001289
1290 acc0 = fma(values0, w0, acc0);
1291 acc0 = fma(values1, w1, acc0);
1292 acc0 = fma(values2, w2, acc0);
1293 acc1 = fma(values1, w0, acc1);
1294 acc1 = fma(values2, w1, acc1);
1295 acc1 = fma(values3, w2, acc1);
1296
1297 acc0 = fma(values4, w3, acc0);
1298 acc0 = fma(values5, w4, acc0);
1299 acc0 = fma(values6, w5, acc0);
1300 acc1 = fma(values5, w3, acc1);
1301 acc1 = fma(values6, w4, acc1);
1302 acc1 = fma(values7, w5, acc1);
1303
1304 acc0 = fma(values8, w6, acc0);
1305 acc0 = fma(values9, w7, acc0);
1306 acc0 = fma(values10, w8, acc0);
1307 acc1 = fma(values9, w6, acc1);
1308 acc1 = fma(values10, w7, acc1);
1309 acc1 = fma(values11, w8, acc1);
1310
1311 acc2 = fma(values4, w0, acc2);
1312 acc2 = fma(values5, w1, acc2);
1313 acc2 = fma(values6, w2, acc2);
1314 acc3 = fma(values5, w0, acc3);
1315 acc3 = fma(values6, w1, acc3);
1316 acc3 = fma(values7, w2, acc3);
1317
1318 acc2 = fma(values8, w3, acc2);
1319 acc2 = fma(values9, w4, acc2);
1320 acc2 = fma(values10, w5, acc2);
1321 acc3 = fma(values9, w3, acc3);
1322 acc3 = fma(values10, w4, acc3);
1323 acc3 = fma(values11, w5, acc3);
1324
1325 acc2 = fma(values12, w6, acc2);
1326 acc2 = fma(values13, w7, acc2);
1327 acc2 = fma(values14, w8, acc2);
1328 acc3 = fma(values13, w6, acc3);
1329 acc3 = fma(values14, w7, acc3);
1330 acc3 = fma(values15, w8, acc3);
1331
1332#if defined(HAS_BIAS)
1333 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1334
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001335 VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
Giorgio Arenad051e972018-06-20 11:46:42 +01001336
1337 acc0 += bias_values;
1338 acc1 += bias_values;
1339 acc2 += bias_values;
1340 acc3 += bias_values;
1341#endif // defined(HAS_BIAS)
1342
1343 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
1344
1345 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001346 (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001347 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001348 (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
Giorgio Arenad051e972018-06-20 11:46:42 +01001349
1350#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1351 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1352#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1353 {
1354 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001355 (acc2, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001356 VSTORE(VEC_SIZE)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001357 (acc3, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
Giorgio Arenad051e972018-06-20 11:46:42 +01001358 }
1359}
1360
1361#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Giorgio Arenae6bb3c62018-08-23 11:19:11 +01001362#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)