blob: fe902ed98187c40ecc378a69db07214817aaadb9 [file] [log] [blame]
Dmitry Savenkod7295b72017-11-20 22:00:08 +07001/*
Giorgio Arena944d3f72018-01-16 15:38:35 +00002 * Copyright (c) 2017-2018 ARM Limited.
Dmitry Savenkod7295b72017-11-20 22:00:08 +07003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers_asymm.h"
26
Giorgio Arenadfca60b2018-01-31 10:30:59 +000027#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
Giorgio Arena287b5702018-02-16 11:01:04 +000028
Giorgio Arena99ac60b2018-02-16 15:17:23 +000029#if defined(FUSED_ACTIVATION)
30#define DATA_TYPE uchar
Giorgio Arenadfca60b2018-01-31 10:30:59 +000031#ifndef VEC_SIZE
Giorgio Arena99ac60b2018-02-16 15:17:23 +000032#define VEC_SIZE 8
Giorgio Arenadfca60b2018-01-31 10:30:59 +000033#endif /* VEC_SIZE */
Giorgio Arena99ac60b2018-02-16 15:17:23 +000034#include "activation_layer_qa8.cl"
35#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)
36#else /* defined(FUSED_ACTIVATION) */
37#define ACTIVATION_FUNC(x) (x)
38#endif /* defined(FUSED_ACTIVATION) */
39
Giorgio Arenaeff8d952018-07-02 15:29:57 +010040#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
41#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
42#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val);
43#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
44#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3));
45#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
46#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
47
48#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER)
Giorgio Arenadfca60b2018-01-31 10:30:59 +000049
Giorgio Arena287b5702018-02-16 11:01:04 +000050#if CONV_STRIDE_X > 3
51#error "Stride X not supported"
52#endif /* CONV_STRIDE_X > 3 */
Dmitry Savenkod7295b72017-11-20 22:00:08 +070053
Giorgio Arenaeff8d952018-07-02 15:29:57 +010054#if !defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
55
Dmitry Savenkod7295b72017-11-20 22:00:08 +070056#if CONV_STRIDE_X == 1
Giorgio Arena287b5702018-02-16 11:01:04 +000057#define GET_VALUES(first_value, left, middle, right) \
58 ({ \
59 int8 temp0 = CONVERT(vload8(0, first_value), int8); \
60 int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \
61 \
62 left = CONVERT(temp0.s01234567, int8); \
63 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \
64 right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \
65 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070066#elif CONV_STRIDE_X == 2
Giorgio Arena287b5702018-02-16 11:01:04 +000067#define GET_VALUES(first_value, left, middle, right) \
68 ({ \
69 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
70 int temp1 = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \
71 \
72 left = CONVERT(temp0.s02468ace, int8); \
73 middle = CONVERT(temp0.s13579bdf, int8); \
74 right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \
75 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070076#else /* CONV_STRIDE_X */
Giorgio Arena287b5702018-02-16 11:01:04 +000077#define GET_VALUES(first_value, left, middle, right) \
78 ({ \
79 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
80 int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \
81 \
82 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
83 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \
84 right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \
85 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070086#endif /* CONV_STRIDE_X */
87
Giorgio Arenadfca60b2018-01-31 10:30:59 +000088/** This function computes the depthwise convolution quantized.
Anthony Barbierf202e502017-11-23 18:02:04 +000089 *
90 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
91 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
92 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
93 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
94 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
95 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
96 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
97 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
98 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
99 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
100 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
101 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
102 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
103 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
104 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
105 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
106 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
107 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
108 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
109 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
110 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
111 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
112 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
113 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
114 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
115 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
116 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
117 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Anthony Barbierf202e502017-11-23 18:02:04 +0000118 */
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700119
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000120__kernel void depthwise_convolution_3x3_quantized_nchw(
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700121 TENSOR3D_DECLARATION(src),
122 TENSOR3D_DECLARATION(dst),
Georgios Pinitas5b2191e2018-02-22 12:56:51 +0000123 TENSOR3D_DECLARATION(weights)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700124#if defined(HAS_BIAS)
Georgios Pinitas5b2191e2018-02-22 12:56:51 +0000125 ,
Giorgio Arena287b5702018-02-16 11:01:04 +0000126 VECTOR_DECLARATION(biases)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700127#endif //defined(HAS_BIAS)
Giorgio Arena287b5702018-02-16 11:01:04 +0000128)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700129{
130 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
131 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
132 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
133#if defined(HAS_BIAS)
134 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700135
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700136 int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
137#endif //defined(HAS_BIAS)
138
Giorgio Arena76572242018-04-04 17:44:26 +0100139 src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
140
Giorgio Arena287b5702018-02-16 11:01:04 +0000141 uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);
142 uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);
143 uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700144
Giorgio Arena287b5702018-02-16 11:01:04 +0000145 int8 values0 = 0;
146 int8 sum0 = 0;
147#if CONV_STRIDE_Y == 1
148 int8 values1 = 0;
149 int8 sum1 = 0;
150#endif /* CONV_STRIDE_Y */
151
152 // Row0
153 int8 left, middle, right;
154 GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);
155 values0 += left * (int8)(w0.s0);
156 values0 += middle * (int8)(w0.s1);
157 values0 += right * (int8)(w0.s2);
158
159#if WEIGHTS_OFFSET != 0
160 sum0 += left + middle + right;
161#endif /* WEIGHTS_OFFSET != 0 */
162
163 // Row1
164 GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right);
165 values0 += left * (int8)(w1.s0);
166 values0 += middle * (int8)(w1.s1);
167 values0 += right * (int8)(w1.s2);
168#if CONV_STRIDE_Y == 1
169 values1 += left * (int8)(w0.s0);
170 values1 += middle * (int8)(w0.s1);
171 values1 += right * (int8)(w0.s2);
172#endif /* CONV_STRIDE_Y == 1 */
173
174#if WEIGHTS_OFFSET != 0
175 int8 tmp = left + middle + right;
176 sum0 += tmp;
177#if CONV_STRIDE_Y == 1
178 sum1 += tmp;
179#endif /* CONV_STRIDE_Y == 1 */
180#endif /* WEIGHTS_OFFSET != 0 */
181
182 // Row2
183 GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right);
184 values0 += left * (int8)(w2.s0);
185 values0 += middle * (int8)(w2.s1);
186 values0 += right * (int8)(w2.s2);
187#if CONV_STRIDE_Y == 1
188 values1 += left * (int8)(w1.s0);
189 values1 += middle * (int8)(w1.s1);
190 values1 += right * (int8)(w1.s2);
191#endif /* CONV_STRIDE_Y == 1 */
192
193#if WEIGHTS_OFFSET != 0
194 tmp = left + middle + right;
195 sum0 += tmp;
196#if CONV_STRIDE_Y == 1
197 sum1 += tmp;
198#endif /* CONV_STRIDE_Y == 1 */
199#endif /* WEIGHTS_OFFSET != 0 */
200
201#if CONV_STRIDE_Y == 1
202 // Row3
203 GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
204 values1 += left * (int8)(w2.s0);
205 values1 += middle * (int8)(w2.s1);
206 values1 += right * (int8)(w2.s2);
207
208#if WEIGHTS_OFFSET != 0
209 sum1 += left + middle + right;
210#endif /* WEIGHTS_OFFSET != 0 */
211#endif /* CONV_STRIDE_Y == 1 */
212
213#if defined(HAS_BIAS)
214 values0 += (int8)(bias_value);
215#if CONV_STRIDE_Y == 1
216 values1 += (int8)(bias_value);
217#endif /* CONV_STRIDE_Y == 1 */
218#endif //defined(HAS_BIAS)
219
220#if WEIGHTS_OFFSET != 0
221 values0 += sum0 * (int8)(WEIGHTS_OFFSET);
222#if CONV_STRIDE_Y == 1
223 values1 += sum1 * (int8)(WEIGHTS_OFFSET);
224#endif /* CONV_STRIDE_Y == 1 */
225#endif /* WEIGHTS_OFFSET != 0 */
226
227#if INPUT_OFFSET != 0
228 ushort sum_weights = 0;
229 ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
230 sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
231 values0 += sum_weights * (int8)(INPUT_OFFSET);
232#if CONV_STRIDE_Y == 1
233 values1 += sum_weights * (int8)(INPUT_OFFSET);
234#endif /* CONV_STRIDE_Y == 1 */
235#endif /* INPUT_OFFSET != 0 */
236
237#if K_OFFSET != 0
238 values0 += (int8)(K_OFFSET);
239#if CONV_STRIDE_Y == 1
240 values1 += (int8)(K_OFFSET);
241#endif /* CONV_STRIDE_Y == 1 */
242#endif /* K_OFFSET != 0 */
243
244 values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
245 values0 += (int8)OUTPUT_OFFSET;
246 uchar8 res0 = convert_uchar8_sat(values0);
247 res0 = max(res0, (uchar8)0);
248 res0 = min(res0, (uchar8)255);
249
Giorgio Arena99ac60b2018-02-16 15:17:23 +0000250 vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
Giorgio Arena287b5702018-02-16 11:01:04 +0000251#if CONV_STRIDE_Y == 1
252
253 values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
254 values1 += (int8)OUTPUT_OFFSET;
255 uchar8 res1 = convert_uchar8_sat(values1);
256 res1 = max(res1, (uchar8)0);
257 res1 = min(res1, (uchar8)255);
258
Giorgio Arena99ac60b2018-02-16 15:17:23 +0000259 vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
Giorgio Arena287b5702018-02-16 11:01:04 +0000260#endif /* CONV_STRIDE_Y == 1 */
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700261}
Giorgio Arena287b5702018-02-16 11:01:04 +0000262
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100263#else // !defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000264
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100265#if CONV_STRIDE_X == 1
266#define GET_VALUES(first_value, left, middle, right) \
267 ({ \
268 uchar8 temp0 = vload8(0, first_value); \
269 uchar2 temp1 = vload2(0, (first_value + 8 * sizeof(uchar))); \
270 \
271 left = temp0.s01234567; \
272 middle = (uchar8)(temp0.s1234, temp0.s567, temp1.s0); \
273 right = (uchar8)(temp0.s2345, temp0.s67, temp1.s01); \
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000274 })
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100275#elif CONV_STRIDE_X == 2
276#define GET_VALUES(first_value, left, middle, right) \
277 ({ \
278 uchar16 temp0 = vload16(0, first_value); \
279 uchar temp1 = *(first_value + 16 * sizeof(uchar)); \
280 \
281 left = temp0.s02468ace; \
282 middle = temp0.s13579bdf; \
283 right = (uchar8)(temp0.s2468, temp0.sace, temp1); \
284 })
285#else /* CONV_STRIDE_X */
286#define GET_VALUES(first_value, left, middle, right) \
287 ({ \
288 uchar16 temp0 = vload16(0, first_value); \
289 uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \
290 \
291 left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
292 middle = (uchar8)(temp0.s147a, temp0.sd, temp1.s036); \
293 right = (uchar8)(temp0.s258b, temp0.se, temp1.s147); \
294 })
295#endif /* CONV_STRIDE_X */
296/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000297 *
298 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
299 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
300 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
301 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
302 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
303 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
304 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
305 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
306 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
307 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
308 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
309 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
310 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
311 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
312 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
313 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
314 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
315 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
316 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
317 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
318 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
319 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
320 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
321 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
322 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
323 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
324 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
325 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
326 */
327
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100328__kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
329 TENSOR3D_DECLARATION(src),
330 TENSOR3D_DECLARATION(dst),
331 TENSOR3D_DECLARATION(weights)
332#if defined(HAS_BIAS)
333 ,
334 VECTOR_DECLARATION(biases)
335#endif //defined(HAS_BIAS)
336)
337{
338 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
339 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
340 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
341#if defined(HAS_BIAS)
342 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
343
344 const int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
345#endif //defined(HAS_BIAS)
346
347 src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
348
349 uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);
350 uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);
351 uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);
352
353 uchar8 left0, middle0, right0;
354 uchar8 left1, middle1, right1;
355 uchar8 left2, middle2, right2;
356
357 int8 values0 = 0;
358 int8 sum0 = 0;
359
360 GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);
361 GET_VALUES(src.ptr + 1 * src_stride_y, left1, middle1, right1);
362 GET_VALUES(src.ptr + 2 * src_stride_y, left2, middle2, right2);
363
364#if WEIGHTS_OFFSET != 0
365 sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
366 sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
367 sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
368#endif /* WEIGHTS_OFFSET != 0 */
369
370#if CONV_STRIDE_Y == 1
371 // If conv_stride_y is equals to 1, we compute two output rows
372
373 uchar8 left3, middle3, right3;
374 int8 values1 = 0;
375 int8 sum1 = 0;
376
377 GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);
378
379#if WEIGHTS_OFFSET != 0
380 sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
381 sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
382 sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
383#endif /* WEIGHTS_OFFSET != 0 */
384#endif // CONV_STRIDE_Y == 1
385
386 ARM_DOT(left0.s0, middle0.s0, right0.s0, left1.s0, w0.s0, w0.s1, w0.s2, w1.s0, values0.s0);
387 ARM_DOT(middle1.s0, right1.s0, left2.s0, middle2.s0, w1.s1, w1.s2, w2.s0, w2.s1, values0.s0);
388 values0.s0 += right2.s0 * w2.s2;
389
390 ARM_DOT(left0.s1, middle0.s1, right0.s1, left1.s1, w0.s0, w0.s1, w0.s2, w1.s0, values0.s1);
391 ARM_DOT(middle1.s1, right1.s1, left2.s1, middle2.s1, w1.s1, w1.s2, w2.s0, w2.s1, values0.s1);
392 values0.s1 += right2.s1 * w2.s2;
393
394 ARM_DOT(left0.s2, middle0.s2, right0.s2, left1.s2, w0.s0, w0.s1, w0.s2, w1.s0, values0.s2);
395 ARM_DOT(middle1.s2, right1.s2, left2.s2, middle2.s2, w1.s1, w1.s2, w2.s0, w2.s1, values0.s2);
396 values0.s2 += right2.s2 * w2.s2;
397
398 ARM_DOT(left0.s3, middle0.s3, right0.s3, left1.s3, w0.s0, w0.s1, w0.s2, w1.s0, values0.s3);
399 ARM_DOT(middle1.s3, right1.s3, left2.s3, middle2.s3, w1.s1, w1.s2, w2.s0, w2.s1, values0.s3);
400 values0.s3 += right2.s3 * w2.s2;
401
402 ARM_DOT(left0.s4, middle0.s4, right0.s4, left1.s4, w0.s0, w0.s1, w0.s2, w1.s0, values0.s4);
403 ARM_DOT(middle1.s4, right1.s4, left2.s4, middle2.s4, w1.s1, w1.s2, w2.s0, w2.s1, values0.s4);
404 values0.s4 += right2.s4 * w2.s2;
405
406 ARM_DOT(left0.s5, middle0.s5, right0.s5, left1.s5, w0.s0, w0.s1, w0.s2, w1.s0, values0.s5);
407 ARM_DOT(middle1.s5, right1.s5, left2.s5, middle2.s5, w1.s1, w1.s2, w2.s0, w2.s1, values0.s5);
408 values0.s5 += right2.s5 * w2.s2;
409
410 ARM_DOT(left0.s6, middle0.s6, right0.s6, left1.s6, w0.s0, w0.s1, w0.s2, w1.s0, values0.s6);
411 ARM_DOT(middle1.s6, right1.s6, left2.s6, middle2.s6, w1.s1, w1.s2, w2.s0, w2.s1, values0.s6);
412 values0.s6 += right2.s6 * w2.s2;
413
414 ARM_DOT(left0.s7, middle0.s7, right0.s7, left1.s7, w0.s0, w0.s1, w0.s2, w1.s0, values0.s7);
415 ARM_DOT(middle1.s7, right1.s7, left2.s7, middle2.s7, w1.s1, w1.s2, w2.s0, w2.s1, values0.s7);
416 values0.s7 += right2.s7 * w2.s2;
417
418#if CONV_STRIDE_Y == 1
419 ARM_DOT(left1.s0, middle1.s0, right1.s0, left2.s0, w0.s0, w0.s1, w0.s2, w1.s0, values1.s0);
420 ARM_DOT(middle2.s0, right2.s0, left3.s0, middle3.s0, w1.s1, w1.s2, w2.s0, w2.s1, values1.s0);
421 values1.s0 += right3.s0 * w2.s2;
422
423 ARM_DOT(left1.s1, middle1.s1, right1.s1, left2.s1, w0.s0, w0.s1, w0.s2, w1.s0, values1.s1);
424 ARM_DOT(middle2.s1, right2.s1, left3.s1, middle3.s1, w1.s1, w1.s2, w2.s0, w2.s1, values1.s1);
425 values1.s1 += right3.s1 * w2.s2;
426
427 ARM_DOT(left1.s2, middle1.s2, right1.s2, left2.s2, w0.s0, w0.s1, w0.s2, w1.s0, values1.s2);
428 ARM_DOT(middle2.s2, right2.s2, left3.s2, middle3.s2, w1.s1, w1.s2, w2.s0, w2.s1, values1.s2);
429 values1.s2 += right3.s2 * w2.s2;
430
431 ARM_DOT(left1.s3, middle1.s3, right1.s3, left2.s3, w0.s0, w0.s1, w0.s2, w1.s0, values1.s3);
432 ARM_DOT(middle2.s3, right2.s3, left3.s3, middle3.s3, w1.s1, w1.s2, w2.s0, w2.s1, values1.s3);
433 values1.s3 += right3.s3 * w2.s2;
434
435 ARM_DOT(left1.s4, middle1.s4, right1.s4, left2.s4, w0.s0, w0.s1, w0.s2, w1.s0, values1.s4);
436 ARM_DOT(middle2.s4, right2.s4, left3.s4, middle3.s4, w1.s1, w1.s2, w2.s0, w2.s1, values1.s4);
437 values1.s4 += right3.s4 * w2.s2;
438
439 ARM_DOT(left1.s5, middle1.s5, right1.s5, left2.s5, w0.s0, w0.s1, w0.s2, w1.s0, values1.s5);
440 ARM_DOT(middle2.s5, right2.s5, left3.s5, middle3.s5, w1.s1, w1.s2, w2.s0, w2.s1, values1.s5);
441 values1.s5 += right3.s5 * w2.s2;
442
443 ARM_DOT(left1.s6, middle1.s6, right1.s6, left2.s6, w0.s0, w0.s1, w0.s2, w1.s0, values1.s6);
444 ARM_DOT(middle2.s6, right2.s6, left3.s6, middle3.s6, w1.s1, w1.s2, w2.s0, w2.s1, values1.s6);
445 values1.s6 += right3.s6 * w2.s2;
446
447 ARM_DOT(left1.s7, middle1.s7, right1.s7, left2.s7, w0.s0, w0.s1, w0.s2, w1.s0, values1.s7);
448 ARM_DOT(middle2.s7, right2.s7, left3.s7, middle3.s7, w1.s1, w1.s2, w2.s0, w2.s1, values1.s7);
449 values1.s7 += right3.s7 * w2.s2;
450#endif // CONV_STRIDE_Y == 1
451
452#if defined(HAS_BIAS)
453 values0 += (int8)(bias_value);
454#if CONV_STRIDE_Y == 1
455 values1 += (int8)(bias_value);
456#endif /* CONV_STRIDE_Y == 1 */
457#endif //defined(HAS_BIAS)
458
459#if WEIGHTS_OFFSET != 0
460 values0 += sum0 * (int8)(WEIGHTS_OFFSET);
461#if CONV_STRIDE_Y == 1
462 values1 += sum1 * (int8)(WEIGHTS_OFFSET);
463#endif /* CONV_STRIDE_Y == 1 */
464#endif /* WEIGHTS_OFFSET != 0 */
465
466#if INPUT_OFFSET != 0
467 ushort sum_weights = 0;
468 ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
469 sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
470 values0 += sum_weights * (int8)(INPUT_OFFSET);
471#if CONV_STRIDE_Y == 1
472 values1 += sum_weights * (int8)(INPUT_OFFSET);
473#endif /* CONV_STRIDE_Y == 1 */
474#endif /* INPUT_OFFSET != 0 */
475
476#if K_OFFSET != 0
477 values0 += (int8)(K_OFFSET);
478#if CONV_STRIDE_Y == 1
479 values1 += (int8)(K_OFFSET);
480#endif /* CONV_STRIDE_Y == 1 */
481#endif /* K_OFFSET != 0 */
482
483 values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
484 values0 += (int8)OUTPUT_OFFSET;
485 uchar8 res0 = convert_uchar8_sat(values0);
486 res0 = max(res0, (uchar8)0);
487 res0 = min(res0, (uchar8)255);
488
489 vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
490#if CONV_STRIDE_Y == 1
491
492 values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
493 values1 += (int8)OUTPUT_OFFSET;
494 uchar8 res1 = convert_uchar8_sat(values1);
495 res1 = max(res1, (uchar8)0);
496 res1 = min(res1, (uchar8)255);
497
498 vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
499#endif /* CONV_STRIDE_Y == 1 */
500}
501
502#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
503
504#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) */
505
506#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
507
508#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
509
510#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
511#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
512#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)
513
514#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT)
515
516#if WEIGHTS_OFFSET != 0
517#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \
518 ({ \
519 sum += CONVERT(x, VEC_INT); \
520 MULTIPLY_ADD(x, y, acc); \
521 })
522#else /* WEIGHTS_OFFSET != 0 */
523#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
524#endif /* WEIGHTS_OFFSET != 0 */
525
526#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
527#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
528 ({ \
529 ARM_DOT(val0.s0, val1.s0, val2.s0, val3.s0, w0.s0, w1.s0, w2.s0, w3.s0, acc.s0); \
530 ARM_DOT(val4.s0, val5.s0, val6.s0, val7.s0, w4.s0, w5.s0, w6.s0, w7.s0, acc.s0); \
531 acc.s0 += val8.s0 * w8.s0; \
532 \
533 ARM_DOT(val0.s1, val1.s1, val2.s1, val3.s1, w0.s1, w1.s1, w2.s1, w3.s1, acc.s1); \
534 ARM_DOT(val4.s1, val5.s1, val6.s1, val7.s1, w4.s1, w5.s1, w6.s1, w7.s1, acc.s1); \
535 acc.s1 += val8.s1 * w8.s1; \
536 \
537 ARM_DOT(val0.s2, val1.s2, val2.s2, val3.s2, w0.s2, w1.s2, w2.s2, w3.s2, acc.s2); \
538 ARM_DOT(val4.s2, val5.s2, val6.s2, val7.s2, w4.s2, w5.s2, w6.s2, w7.s2, acc.s2); \
539 acc.s2 += val8.s2 * w8.s2; \
540 \
541 ARM_DOT(val0.s3, val1.s3, val2.s3, val3.s3, w0.s3, w1.s3, w2.s3, w3.s3, acc.s3); \
542 ARM_DOT(val4.s3, val5.s3, val6.s3, val7.s3, w4.s3, w5.s3, w6.s3, w7.s3, acc.s3); \
543 acc.s3 += val8.s3 * w8.s3; \
544 })
545
546#if WEIGHTS_OFFSET != 0
547#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
548 ({ \
549 sum += CONVERT(val0, VEC_INT) + CONVERT(val1, VEC_INT) + CONVERT(val2, VEC_INT) + CONVERT(val3, VEC_INT) + CONVERT(val4, VEC_INT) + CONVERT(val5, VEC_INT) + CONVERT(val6, VEC_INT) + CONVERT(val7, VEC_INT) + CONVERT(val8, VEC_INT); \
550 DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8); \
551 })
552#else /* WEIGHTS_OFFSET != 0 */
553#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)
554#endif /* WEIGHTS_OFFSET != 0 */
555
556#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
557
558#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
559/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
560 *
561 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
562 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
563 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
564 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
565 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
566 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
567 *
568 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
569 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
570 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
571 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
572 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
573 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
574 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
575 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
576 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
577 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
578 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
579 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
580 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
581 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
582 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
583 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
584 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
585 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
586 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
587 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
588 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
589 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
590 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
591 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
592 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
593 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
594 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
595 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
596 * @param[in] max_offset Max offset for the input tensor
597 */
598__kernel void depthwise_convolution_3x3_quantized_nhwc(
599 TENSOR3D_DECLARATION(src),
600 TENSOR3D_DECLARATION(dst),
601 TENSOR3D_DECLARATION(weights),
602#if defined(HAS_BIAS)
603 VECTOR_DECLARATION(biases),
604#endif /* defined(HAS_BIAS) */
605 int max_offset)
606{
607 const int x = get_global_id(0); // channels
608 const int y = get_global_id(1); // spatial coordinate x
609 const int z = get_global_id(2); // spatial coordinate y
610
611 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
612
613 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
614
615 int z_coord = 0;
616 int4 offset = 0;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100617 const int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100618
619 // We compute 2x1x1 [C,W,H] elements
620 VEC_INT acc = 0, sum = 0;
621
622 // Load weights
623 VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
624 VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
625 VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
626 VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
627 VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
628 VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
629 VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
630 VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
631 VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
632
633#if INPUT_OFFSET != 0
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100634 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
635 + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
636 + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100637#endif /* INPUT_OFFSET != 0 */
638
639 // Load input values
640 // z == 0
641 // Clamp z_coord as for z = 0, it can be negative
642 // z_coord is casted to unsigned int in order to use just a min() operation
643 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100644 z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100645 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
646 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100647 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100648
649 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
650 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
651 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
652
653 // z == 1
654 // z_coord can be only negative for z = 0 so we do not need to clamp it
655 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100656 z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100657 offset = y_offset + (int4)(z_coord * src_stride_z);
658 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
659 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
660 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
661
662 // z == 2
663 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
664 // However offset can be out-of-bound so we need to check if it is greater than max_offset
665 offset += (int4)src_stride_z;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100666 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100667 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
668 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
669 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
670
671 MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);
672 MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);
673 MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);
674
675 MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);
676 MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);
677 MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);
678
679 MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);
680 MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);
681 MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);
682
683#if defined(HAS_BIAS)
684 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
685 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
686 acc += bias_values;
687#endif // defined(HAS_BIAS)
688
689#if WEIGHTS_OFFSET != 0
690 acc += WEIGHTS_OFFSET * sum;
691#endif /* WEIGHTS_OFFSET != 0 */
692
693#if INPUT_OFFSET != 0
694 acc += INPUT_OFFSET * sum_we;
695#endif /* INPUT_OFFSET != 0 */
696
697#if K_OFFSET != 0
698 acc += (VEC_INT)K_OFFSET;
699#endif /* K_OFFSET != 0 */
700
701 acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
702 acc += (VEC_INT)OUTPUT_OFFSET;
703
704 VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR);
705 res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255);
706
707 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
708 VSTORE(VEC_SIZE)
709 (res, 0, dst.ptr);
710}
711#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
712
713#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
714/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1
715 *
716 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
717 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
718 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
719 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
720 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
721 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
722 *
723 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
724 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
725 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
726 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
727 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
728 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
729 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
730 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
731 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
732 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
733 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
734 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
735 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
736 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
737 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
738 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
739 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
740 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
741 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
742 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
743 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
744 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
745 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
746 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
747 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
748 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
749 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
750 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
751 * @param[in] max_offset Max offset for the input tensor
752 */
753
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000754__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
755 TENSOR3D_DECLARATION(src),
756 TENSOR3D_DECLARATION(dst),
757 TENSOR3D_DECLARATION(weights),
758#if defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100759 VECTOR_DECLARATION(biases),
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000760#endif /* defined(HAS_BIAS) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100761 int max_offset)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000762{
Giorgio Arenafa23f112018-06-19 11:27:38 +0100763 int x = get_global_id(0);
764 int y = get_global_id(1);
765 int z = get_global_id(2);
766
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000767 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100768
769 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
770
771 int z_coord = 0;
772 int4 offset = 0;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100773 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100774
775 // We compute 2x2x2 [C,W,H] elements
776 VEC_INT acc0 = 0, sum0 = 0;
777 VEC_INT acc1 = 0, sum1 = 0;
778 VEC_INT acc2 = 0, sum2 = 0;
779 VEC_INT acc3 = 0, sum3 = 0;
780
781 // Load weights
782 VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
783 VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
784 VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
785 VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
786 VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
787 VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
788 VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
789 VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
790 VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
791
792#if INPUT_OFFSET != 0
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100793 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
794 + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
795 + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100796#endif /* INPUT_OFFSET != 0 */
797
798 // Load input values
799 // z == 0
800 // Clamp z_coord as for z = 0, it can be negative
801 // z_coord is casted to unsigned int in order to use just a min() operation
802 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100803 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100804 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
805 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100806 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100807
808 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
809 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
810 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
811 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
812
813 // z == 1
814 // z_coord can be only negative for z = 0 so we do not need to clamp it
815 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100816 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100817 offset = y_offset + (int4)(z_coord * src_stride_z);
818 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
819 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
820 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
821 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
822
823 // z == 2
824 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
825 // However offset can be out-of-bound so we need to check if it is greater than max_offset
826 offset += (int4)src_stride_z;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100827 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100828 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
829 VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
830 VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
831 VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
832
833 // z == 3
834 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
835 // However offset can be out-of-bound so we need to check if it is greater than max_offset
836 offset += (int4)(src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100837 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100838 VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
839 VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
840 VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
841 VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
842
843 MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);
844 MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);
845 MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);
846 MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);
847 MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);
848 MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);
849
850 MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);
851 MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);
852 MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);
853 MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);
854 MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);
855 MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);
856
857 MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);
858 MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);
859 MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);
860 MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);
861 MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);
862 MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);
863
864 MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);
865 MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);
866 MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);
867 MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);
868 MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);
869 MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);
870
871 MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);
872 MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);
873 MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);
874 MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);
875 MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);
876 MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);
877
878 MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);
879 MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);
880 MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);
881 MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);
882 MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);
883 MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);
884
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000885#if defined(HAS_BIAS)
886 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
887
888 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000889
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000890 acc0 += bias_values;
891 acc1 += bias_values;
892 acc2 += bias_values;
893 acc3 += bias_values;
894#endif /* defined(HAS_BIAS) */
895
896#if WEIGHTS_OFFSET != 0
897 acc0 += WEIGHTS_OFFSET * sum0;
898 acc1 += WEIGHTS_OFFSET * sum1;
899 acc2 += WEIGHTS_OFFSET * sum2;
900 acc3 += WEIGHTS_OFFSET * sum3;
901#endif /* WEIGHTS_OFFSET != 0 */
902
903#if INPUT_OFFSET != 0
904 VEC_INT offs = INPUT_OFFSET * sum_we;
905
906 acc0 += offs;
907 acc1 += offs;
908 acc2 += offs;
909 acc3 += offs;
910#endif /* INPUT_OFFSET != 0 */
911
912#if K_OFFSET != 0
913 acc0 += (VEC_INT)K_OFFSET;
914 acc1 += (VEC_INT)K_OFFSET;
915 acc2 += (VEC_INT)K_OFFSET;
916 acc3 += (VEC_INT)K_OFFSET;
917#endif /* K_OFFSET != 0 */
918
919 acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
920 acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
921 acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
922 acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
923
924 acc0 += (VEC_INT)OUTPUT_OFFSET;
925 acc1 += (VEC_INT)OUTPUT_OFFSET;
926 acc2 += (VEC_INT)OUTPUT_OFFSET;
927 acc3 += (VEC_INT)OUTPUT_OFFSET;
928
929 VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
930 VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
931 VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
932 VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);
933
934 res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
935 res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
936 res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
937 res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
938
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100939 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
940
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000941 VSTORE(VEC_SIZE)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100942 (res0, 0, dst_addr + 0 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000943 VSTORE(VEC_SIZE)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100944 (res1, 0, dst_addr + 1 * dst_stride_y);
945
946#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
947 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
948#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
949 {
950 VSTORE(VEC_SIZE)
951 (res2, 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
952 VSTORE(VEC_SIZE)
953 (res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
954 }
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000955}
956
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100957#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
958/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product
959 *
960 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
961 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
962 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
963 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
964 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
965 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000966 *
967 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
968 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
969 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
970 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
971 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
972 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
973 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
974 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
975 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
976 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
977 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
978 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
979 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
980 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
981 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
982 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
983 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
984 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
985 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
986 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
987 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
988 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
989 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
990 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
991 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
992 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
993 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
994 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
995 */
996
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100997__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000998 TENSOR3D_DECLARATION(src),
999 TENSOR3D_DECLARATION(dst),
1000 TENSOR3D_DECLARATION(weights),
1001#if defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001002 VECTOR_DECLARATION(biases),
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001003#endif /* defined(HAS_BIAS) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001004 int max_offset)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001005{
Giorgio Arenafa23f112018-06-19 11:27:38 +01001006 int x = get_global_id(0);
1007 int y = get_global_id(1);
1008 int z = get_global_id(2);
1009
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001010 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001011
1012 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
1013
1014 int z_coord = 0;
1015 int4 offset = 0;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001016 int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001017
1018 // We compute 2x2x2 [C,W,H] elements
1019 VEC_INT acc0 = 0, sum0 = 0;
1020 VEC_INT acc1 = 0, sum1 = 0;
1021 VEC_INT acc2 = 0, sum2 = 0;
1022 VEC_INT acc3 = 0, sum3 = 0;
1023
1024 // Load weights
1025 VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
1026 VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
1027 VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
1028 VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
1029 VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
1030 VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
1031 VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
1032 VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
1033 VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
1034
1035#if INPUT_OFFSET != 0
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001036 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
1037 + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
1038 + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001039#endif /* INPUT_OFFSET != 0 */
1040
1041 // Load input values
1042 // z == 0
1043 // Clamp z_coord as for z = 0, it can be negative
1044 // z_coord is casted to unsigned int in order to use just a min() operation
1045 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001046 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001047 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1048 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001049 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001050
1051 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1052 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1053 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1054 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1055
1056 // z == 1
1057 // z_coord can be only negative for z = 0 so we do not need to clamp it
1058 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001059 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001060 offset = y_offset + (int4)(z_coord * src_stride_z);
1061 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1062 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1063 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1064 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1065
1066 // z == 2
1067 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1068 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1069 offset += (int4)src_stride_z;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001070 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001071 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1072 VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1073 VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1074 VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1075
1076 // z == 3
1077 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1078 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1079 offset += (int4)(src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001080 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001081 VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1082 VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1083 VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1084 VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1085
1086 DOT_PRODUCT_ACCUMULATE(acc0, sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);
1087 DOT_PRODUCT_ACCUMULATE(acc1, sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);
1088 DOT_PRODUCT_ACCUMULATE(acc2, sum2, values4, values5, values6, values8, values9, values10, values12, values13, values14, w0, w1, w2, w3, w4, w5, w6, w7, w8);
1089 DOT_PRODUCT_ACCUMULATE(acc3, sum3, values5, values6, values7, values9, values10, values11, values13, values14, values15, w0, w1, w2, w3, w4, w5, w6, w7, w8);
1090
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001091#if defined(HAS_BIAS)
1092 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1093
1094 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001095
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001096 acc0 += bias_values;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001097 acc1 += bias_values;
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001098 acc2 += bias_values;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001099 acc3 += bias_values;
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001100#endif /* defined(HAS_BIAS) */
1101
1102#if WEIGHTS_OFFSET != 0
1103 acc0 += WEIGHTS_OFFSET * sum0;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001104 acc1 += WEIGHTS_OFFSET * sum1;
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001105 acc2 += WEIGHTS_OFFSET * sum2;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001106 acc3 += WEIGHTS_OFFSET * sum3;
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001107#endif /* WEIGHTS_OFFSET != 0 */
1108
1109#if INPUT_OFFSET != 0
1110 VEC_INT offs = INPUT_OFFSET * sum_we;
1111
1112 acc0 += offs;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001113 acc1 += offs;
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001114 acc2 += offs;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001115 acc3 += offs;
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001116#endif /* INPUT_OFFSET != 0 */
1117
1118#if K_OFFSET != 0
1119 acc0 += (VEC_INT)K_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001120 acc1 += (VEC_INT)K_OFFSET;
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001121 acc2 += (VEC_INT)K_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001122 acc3 += (VEC_INT)K_OFFSET;
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001123#endif /* K_OFFSET != 0 */
1124
1125 acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001126 acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001127 acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001128 acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1129
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001130 acc0 += (VEC_INT)OUTPUT_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001131 acc1 += (VEC_INT)OUTPUT_OFFSET;
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001132 acc2 += (VEC_INT)OUTPUT_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001133 acc3 += (VEC_INT)OUTPUT_OFFSET;
1134
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001135 VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001136 VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001137 VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001138 VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);
1139
1140 res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
1141 res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
1142 res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
1143 res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
1144
1145 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001146
1147 VSTORE(VEC_SIZE)
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001148 (res0, 0, dst_addr + 0 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001149 VSTORE(VEC_SIZE)
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001150 (res1, 0, dst_addr + 1 * dst_stride_y);
1151
1152#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1153 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1154#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1155 {
1156 VSTORE(VEC_SIZE)
1157 (res2, 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
1158 VSTORE(VEC_SIZE)
1159 (res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
1160 }
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001161}
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001162#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001163
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001164#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001165
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001166#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
1167
1168#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)