blob: 5a732b486369d482868247c0960cc6e051bc01a7 [file] [log] [blame]
Dmitry Savenkod7295b72017-11-20 22:00:08 +07001/*
Giorgio Arena944d3f72018-01-16 15:38:35 +00002 * Copyright (c) 2017-2018 ARM Limited.
Dmitry Savenkod7295b72017-11-20 22:00:08 +07003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers_asymm.h"
26
Gian Marco Iodice4b908652018-10-18 10:21:02 +010027#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
Giorgio Arena287b5702018-02-16 11:01:04 +000028
Giorgio Arena99ac60b2018-02-16 15:17:23 +000029#if defined(FUSED_ACTIVATION)
30#define DATA_TYPE uchar
Giorgio Arenadfca60b2018-01-31 10:30:59 +000031#ifndef VEC_SIZE
Giorgio Arena99ac60b2018-02-16 15:17:23 +000032#define VEC_SIZE 8
Giorgio Arenadfca60b2018-01-31 10:30:59 +000033#endif /* VEC_SIZE */
Giorgio Arena99ac60b2018-02-16 15:17:23 +000034#include "activation_layer_qa8.cl"
35#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)
36#else /* defined(FUSED_ACTIVATION) */
37#define ACTIVATION_FUNC(x) (x)
38#endif /* defined(FUSED_ACTIVATION) */
39
Georgios Pinitasdaa38552018-08-28 17:43:18 +010040#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
41#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010042#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);
Georgios Pinitasdaa38552018-08-28 17:43:18 +010043#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010044#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010045#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
46#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +010047
Georgios Pinitase55b40a2018-09-13 17:20:04 +010048#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arenadfca60b2018-01-31 10:30:59 +000049
Giorgio Arena287b5702018-02-16 11:01:04 +000050#if CONV_STRIDE_X > 3
51#error "Stride X not supported"
52#endif /* CONV_STRIDE_X > 3 */
Dmitry Savenkod7295b72017-11-20 22:00:08 +070053
Georgios Pinitasdaa38552018-08-28 17:43:18 +010054#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
Giorgio Arenaeff8d952018-07-02 15:29:57 +010055
Dmitry Savenkod7295b72017-11-20 22:00:08 +070056#if CONV_STRIDE_X == 1
Giorgio Arena287b5702018-02-16 11:01:04 +000057#define GET_VALUES(first_value, left, middle, right) \
58 ({ \
59 int8 temp0 = CONVERT(vload8(0, first_value), int8); \
60 int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \
61 \
62 left = CONVERT(temp0.s01234567, int8); \
63 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \
64 right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \
65 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070066#elif CONV_STRIDE_X == 2
Giorgio Arena287b5702018-02-16 11:01:04 +000067#define GET_VALUES(first_value, left, middle, right) \
68 ({ \
69 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
70 int temp1 = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \
71 \
72 left = CONVERT(temp0.s02468ace, int8); \
73 middle = CONVERT(temp0.s13579bdf, int8); \
74 right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \
75 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070076#else /* CONV_STRIDE_X */
Giorgio Arena287b5702018-02-16 11:01:04 +000077#define GET_VALUES(first_value, left, middle, right) \
78 ({ \
79 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
80 int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \
81 \
82 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
83 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \
84 right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \
85 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070086#endif /* CONV_STRIDE_X */
87
Giorgio Arenadfca60b2018-01-31 10:30:59 +000088/** This function computes the depthwise convolution quantized.
Anthony Barbierf202e502017-11-23 18:02:04 +000089 *
Georgios Pinitas37044642018-10-30 14:53:25 +000090 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
91 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +000092 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +000093 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +000094 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +000095 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
96 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +000097 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +000098 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
99 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
100 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
101 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
102 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
103 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
104 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
105 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
106 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
107 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
108 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
109 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
110 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
111 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
112 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
113 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
114 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
115 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
116 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
117 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Anthony Barbierf202e502017-11-23 18:02:04 +0000118 */
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700119
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000120__kernel void depthwise_convolution_3x3_quantized_nchw(
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700121 TENSOR3D_DECLARATION(src),
122 TENSOR3D_DECLARATION(dst),
Georgios Pinitas5b2191e2018-02-22 12:56:51 +0000123 TENSOR3D_DECLARATION(weights)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700124#if defined(HAS_BIAS)
Georgios Pinitas5b2191e2018-02-22 12:56:51 +0000125 ,
Giorgio Arena287b5702018-02-16 11:01:04 +0000126 VECTOR_DECLARATION(biases)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700127#endif //defined(HAS_BIAS)
Giorgio Arena287b5702018-02-16 11:01:04 +0000128)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700129{
130 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
131 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100132 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
133
134 // Extract channel and linearized batch indices
135 const int channel = get_global_id(2) % DST_CHANNELS;
136 const int batch = get_global_id(2) / DST_CHANNELS;
137
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700138#if defined(HAS_BIAS)
139 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700140
Georgios Pinitas728d3cf2018-09-21 13:41:35 +0100141 int bias_value = *((__global int *)(vector_offset(&biases, channel)));
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700142#endif //defined(HAS_BIAS)
143
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100144 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
145 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
146 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100147
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100148 uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);
149 uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);
150 uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700151
Giorgio Arena287b5702018-02-16 11:01:04 +0000152 int8 values0 = 0;
153 int8 sum0 = 0;
154#if CONV_STRIDE_Y == 1
155 int8 values1 = 0;
156 int8 sum1 = 0;
157#endif /* CONV_STRIDE_Y */
158
159 // Row0
160 int8 left, middle, right;
161 GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);
162 values0 += left * (int8)(w0.s0);
163 values0 += middle * (int8)(w0.s1);
164 values0 += right * (int8)(w0.s2);
165
166#if WEIGHTS_OFFSET != 0
167 sum0 += left + middle + right;
168#endif /* WEIGHTS_OFFSET != 0 */
169
170 // Row1
171 GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right);
172 values0 += left * (int8)(w1.s0);
173 values0 += middle * (int8)(w1.s1);
174 values0 += right * (int8)(w1.s2);
175#if CONV_STRIDE_Y == 1
176 values1 += left * (int8)(w0.s0);
177 values1 += middle * (int8)(w0.s1);
178 values1 += right * (int8)(w0.s2);
179#endif /* CONV_STRIDE_Y == 1 */
180
181#if WEIGHTS_OFFSET != 0
182 int8 tmp = left + middle + right;
183 sum0 += tmp;
184#if CONV_STRIDE_Y == 1
185 sum1 += tmp;
186#endif /* CONV_STRIDE_Y == 1 */
187#endif /* WEIGHTS_OFFSET != 0 */
188
189 // Row2
190 GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right);
191 values0 += left * (int8)(w2.s0);
192 values0 += middle * (int8)(w2.s1);
193 values0 += right * (int8)(w2.s2);
194#if CONV_STRIDE_Y == 1
195 values1 += left * (int8)(w1.s0);
196 values1 += middle * (int8)(w1.s1);
197 values1 += right * (int8)(w1.s2);
198#endif /* CONV_STRIDE_Y == 1 */
199
200#if WEIGHTS_OFFSET != 0
201 tmp = left + middle + right;
202 sum0 += tmp;
203#if CONV_STRIDE_Y == 1
204 sum1 += tmp;
205#endif /* CONV_STRIDE_Y == 1 */
206#endif /* WEIGHTS_OFFSET != 0 */
207
208#if CONV_STRIDE_Y == 1
209 // Row3
210 GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
211 values1 += left * (int8)(w2.s0);
212 values1 += middle * (int8)(w2.s1);
213 values1 += right * (int8)(w2.s2);
214
215#if WEIGHTS_OFFSET != 0
216 sum1 += left + middle + right;
217#endif /* WEIGHTS_OFFSET != 0 */
218#endif /* CONV_STRIDE_Y == 1 */
219
220#if defined(HAS_BIAS)
221 values0 += (int8)(bias_value);
222#if CONV_STRIDE_Y == 1
223 values1 += (int8)(bias_value);
224#endif /* CONV_STRIDE_Y == 1 */
225#endif //defined(HAS_BIAS)
226
227#if WEIGHTS_OFFSET != 0
228 values0 += sum0 * (int8)(WEIGHTS_OFFSET);
229#if CONV_STRIDE_Y == 1
230 values1 += sum1 * (int8)(WEIGHTS_OFFSET);
231#endif /* CONV_STRIDE_Y == 1 */
232#endif /* WEIGHTS_OFFSET != 0 */
233
234#if INPUT_OFFSET != 0
235 ushort sum_weights = 0;
236 ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
237 sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
238 values0 += sum_weights * (int8)(INPUT_OFFSET);
239#if CONV_STRIDE_Y == 1
240 values1 += sum_weights * (int8)(INPUT_OFFSET);
241#endif /* CONV_STRIDE_Y == 1 */
242#endif /* INPUT_OFFSET != 0 */
243
244#if K_OFFSET != 0
245 values0 += (int8)(K_OFFSET);
246#if CONV_STRIDE_Y == 1
247 values1 += (int8)(K_OFFSET);
248#endif /* CONV_STRIDE_Y == 1 */
249#endif /* K_OFFSET != 0 */
250
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100251#if defined(REAL_MULTIPLIER)
252
253 values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
254
255#else // defined(REAL_MULTIPLIER)
256
Giorgio Arena287b5702018-02-16 11:01:04 +0000257 values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100258
259#endif // defined(REAL_MULTIPLIER)
260
Giorgio Arena287b5702018-02-16 11:01:04 +0000261 values0 += (int8)OUTPUT_OFFSET;
262 uchar8 res0 = convert_uchar8_sat(values0);
263 res0 = max(res0, (uchar8)0);
264 res0 = min(res0, (uchar8)255);
265
Giorgio Arena99ac60b2018-02-16 15:17:23 +0000266 vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
Giorgio Arena287b5702018-02-16 11:01:04 +0000267#if CONV_STRIDE_Y == 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100268#if defined(REAL_MULTIPLIER)
269
270 values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
271
272#else // defined(REAL_MULTIPLIER)
Giorgio Arena287b5702018-02-16 11:01:04 +0000273
274 values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100275
276#endif // defined(REAL_MULTIPLIER)
277
Giorgio Arena287b5702018-02-16 11:01:04 +0000278 values1 += (int8)OUTPUT_OFFSET;
279 uchar8 res1 = convert_uchar8_sat(values1);
280 res1 = max(res1, (uchar8)0);
281 res1 = min(res1, (uchar8)255);
282
Giorgio Arena99ac60b2018-02-16 15:17:23 +0000283 vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
Giorgio Arena287b5702018-02-16 11:01:04 +0000284#endif /* CONV_STRIDE_Y == 1 */
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700285}
Giorgio Arena287b5702018-02-16 11:01:04 +0000286
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100287#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000288
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100289#if CONV_STRIDE_X == 1
290#define GET_VALUES(first_value, left, middle, right) \
291 ({ \
292 uchar8 temp0 = vload8(0, first_value); \
293 uchar2 temp1 = vload2(0, (first_value + 8 * sizeof(uchar))); \
294 \
295 left = temp0.s01234567; \
296 middle = (uchar8)(temp0.s1234, temp0.s567, temp1.s0); \
297 right = (uchar8)(temp0.s2345, temp0.s67, temp1.s01); \
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000298 })
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100299#elif CONV_STRIDE_X == 2
300#define GET_VALUES(first_value, left, middle, right) \
301 ({ \
302 uchar16 temp0 = vload16(0, first_value); \
303 uchar temp1 = *(first_value + 16 * sizeof(uchar)); \
304 \
305 left = temp0.s02468ace; \
306 middle = temp0.s13579bdf; \
307 right = (uchar8)(temp0.s2468, temp0.sace, temp1); \
308 })
309#else /* CONV_STRIDE_X */
310#define GET_VALUES(first_value, left, middle, right) \
311 ({ \
312 uchar16 temp0 = vload16(0, first_value); \
313 uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \
314 \
315 left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
316 middle = (uchar8)(temp0.s147a, temp0.sd, temp1.s036); \
317 right = (uchar8)(temp0.s258b, temp0.se, temp1.s147); \
318 })
319#endif /* CONV_STRIDE_X */
320/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000321 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000322 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
323 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000324 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000325 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000326 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000327 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
328 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000329 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000330 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
331 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
332 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
333 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
334 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
335 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
336 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
337 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
338 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
339 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
340 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
341 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
342 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
343 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
344 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
345 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
346 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
347 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
348 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
349 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
350 */
351
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100352__kernel void depthwise_convolution_3x3_quantized_dot8_nchw(
353 TENSOR3D_DECLARATION(src),
354 TENSOR3D_DECLARATION(dst),
355 TENSOR3D_DECLARATION(weights)
356#if defined(HAS_BIAS)
357 ,
358 VECTOR_DECLARATION(biases)
359#endif //defined(HAS_BIAS)
360)
361{
362 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
363 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100364 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100365
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100366 // Extract channel and linearized batch indices
367 const int channel = get_global_id(2) % DST_CHANNELS;
368 const int batch = get_global_id(2) / DST_CHANNELS;
369
370#if defined(HAS_BIAS)
371 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
372
373 const int bias_value = *((__global int *)(vector_offset(&biases, channel)));
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100374#endif //defined(HAS_BIAS)
375
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100376 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
377 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
378 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100379
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100380 uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);
381 uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);
382 uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100383
384 uchar8 left0, middle0, right0;
385 uchar8 left1, middle1, right1;
386 uchar8 left2, middle2, right2;
387
388 int8 values0 = 0;
389 int8 sum0 = 0;
390
391 GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);
392 GET_VALUES(src.ptr + 1 * src_stride_y, left1, middle1, right1);
393 GET_VALUES(src.ptr + 2 * src_stride_y, left2, middle2, right2);
394
395#if WEIGHTS_OFFSET != 0
396 sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
397 sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
398 sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
399#endif /* WEIGHTS_OFFSET != 0 */
400
401#if CONV_STRIDE_Y == 1
402 // If conv_stride_y is equals to 1, we compute two output rows
403
404 uchar8 left3, middle3, right3;
405 int8 values1 = 0;
406 int8 sum1 = 0;
407
408 GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);
409
410#if WEIGHTS_OFFSET != 0
411 sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
412 sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
413 sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
414#endif /* WEIGHTS_OFFSET != 0 */
415#endif // CONV_STRIDE_Y == 1
416
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100417 ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
418 ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100419 values0.s0 += right2.s0 * w2.s2;
420
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100421 ARM_DOT((uchar4)(left0.s1, middle0.s1, right0.s1, left1.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);
422 ARM_DOT((uchar4)(middle1.s1, right1.s1, left2.s1, middle2.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100423 values0.s1 += right2.s1 * w2.s2;
424
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100425 ARM_DOT((uchar4)(left0.s2, middle0.s2, right0.s2, left1.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);
426 ARM_DOT((uchar4)(middle1.s2, right1.s2, left2.s2, middle2.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100427 values0.s2 += right2.s2 * w2.s2;
428
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100429 ARM_DOT((uchar4)(left0.s3, middle0.s3, right0.s3, left1.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);
430 ARM_DOT((uchar4)(middle1.s3, right1.s3, left2.s3, middle2.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100431 values0.s3 += right2.s3 * w2.s2;
432
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100433 ARM_DOT((uchar4)(left0.s4, middle0.s4, right0.s4, left1.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);
434 ARM_DOT((uchar4)(middle1.s4, right1.s4, left2.s4, middle2.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100435 values0.s4 += right2.s4 * w2.s2;
436
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100437 ARM_DOT((uchar4)(left0.s5, middle0.s5, right0.s5, left1.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);
438 ARM_DOT((uchar4)(middle1.s5, right1.s5, left2.s5, middle2.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100439 values0.s5 += right2.s5 * w2.s2;
440
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100441 ARM_DOT((uchar4)(left0.s6, middle0.s6, right0.s6, left1.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);
442 ARM_DOT((uchar4)(middle1.s6, right1.s6, left2.s6, middle2.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100443 values0.s6 += right2.s6 * w2.s2;
444
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100445 ARM_DOT((uchar4)(left0.s7, middle0.s7, right0.s7, left1.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);
446 ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100447 values0.s7 += right2.s7 * w2.s2;
448
449#if CONV_STRIDE_Y == 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100450 ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
451 ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100452 values1.s0 += right3.s0 * w2.s2;
453
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100454 ARM_DOT((uchar4)(left1.s1, middle1.s1, right1.s1, left2.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);
455 ARM_DOT((uchar4)(middle2.s1, right2.s1, left3.s1, middle3.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100456 values1.s1 += right3.s1 * w2.s2;
457
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100458 ARM_DOT((uchar4)(left1.s2, middle1.s2, right1.s2, left2.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);
459 ARM_DOT((uchar4)(middle2.s2, right2.s2, left3.s2, middle3.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100460 values1.s2 += right3.s2 * w2.s2;
461
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100462 ARM_DOT((uchar4)(left1.s3, middle1.s3, right1.s3, left2.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);
463 ARM_DOT((uchar4)(middle2.s3, right2.s3, left3.s3, middle3.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100464 values1.s3 += right3.s3 * w2.s2;
465
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100466 ARM_DOT((uchar4)(left1.s4, middle1.s4, right1.s4, left2.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);
467 ARM_DOT((uchar4)(middle2.s4, right2.s4, left3.s4, middle3.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100468 values1.s4 += right3.s4 * w2.s2;
469
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100470 ARM_DOT((uchar4)(left1.s5, middle1.s5, right1.s5, left2.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);
471 ARM_DOT((uchar4)(middle2.s5, right2.s5, left3.s5, middle3.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100472 values1.s5 += right3.s5 * w2.s2;
473
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100474 ARM_DOT((uchar4)(left1.s6, middle1.s6, right1.s6, left2.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);
475 ARM_DOT((uchar4)(middle2.s6, right2.s6, left3.s6, middle3.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100476 values1.s6 += right3.s6 * w2.s2;
477
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100478 ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
479 ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100480 values1.s7 += right3.s7 * w2.s2;
481#endif // CONV_STRIDE_Y == 1
482
483#if defined(HAS_BIAS)
484 values0 += (int8)(bias_value);
485#if CONV_STRIDE_Y == 1
486 values1 += (int8)(bias_value);
487#endif /* CONV_STRIDE_Y == 1 */
488#endif //defined(HAS_BIAS)
489
490#if WEIGHTS_OFFSET != 0
491 values0 += sum0 * (int8)(WEIGHTS_OFFSET);
492#if CONV_STRIDE_Y == 1
493 values1 += sum1 * (int8)(WEIGHTS_OFFSET);
494#endif /* CONV_STRIDE_Y == 1 */
495#endif /* WEIGHTS_OFFSET != 0 */
496
497#if INPUT_OFFSET != 0
498 ushort sum_weights = 0;
499 ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
500 sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
501 values0 += sum_weights * (int8)(INPUT_OFFSET);
502#if CONV_STRIDE_Y == 1
503 values1 += sum_weights * (int8)(INPUT_OFFSET);
504#endif /* CONV_STRIDE_Y == 1 */
505#endif /* INPUT_OFFSET != 0 */
506
507#if K_OFFSET != 0
508 values0 += (int8)(K_OFFSET);
509#if CONV_STRIDE_Y == 1
510 values1 += (int8)(K_OFFSET);
511#endif /* CONV_STRIDE_Y == 1 */
512#endif /* K_OFFSET != 0 */
513
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100514#if defined(REAL_MULTIPLIER)
515
516 values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
517
518#else // defined(REAL_MULTIPLIER)
519
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100520 values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100521
522#endif // defined(REAL_MULTIPLIER)
523
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100524 values0 += (int8)OUTPUT_OFFSET;
525 uchar8 res0 = convert_uchar8_sat(values0);
526 res0 = max(res0, (uchar8)0);
527 res0 = min(res0, (uchar8)255);
528
529 vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
530#if CONV_STRIDE_Y == 1
531
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100532#if defined(REAL_MULTIPLIER)
533
534 values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
535
536#else // defined(REAL_MULTIPLIER)
537
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100538 values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100539
540#endif // defined(REAL_MULTIPLIER)
541
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100542 values1 += (int8)OUTPUT_OFFSET;
543 uchar8 res1 = convert_uchar8_sat(values1);
544 res1 = max(res1, (uchar8)0);
545 res1 = min(res1, (uchar8)255);
546
547 vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
548#endif /* CONV_STRIDE_Y == 1 */
549}
550
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100551#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100552
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100553#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100554
555#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
556
557#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
558
559#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100560#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100561#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
562#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)
563
564#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT)
565
566#if WEIGHTS_OFFSET != 0
567#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \
568 ({ \
569 sum += CONVERT(x, VEC_INT); \
570 MULTIPLY_ADD(x, y, acc); \
571 })
572#else /* WEIGHTS_OFFSET != 0 */
573#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
574#endif /* WEIGHTS_OFFSET != 0 */
575
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100576#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100577#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
578 ({ \
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100579 ARM_DOT((uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), (uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), acc.s0); \
580 ARM_DOT((uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), (uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), acc.s0); \
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100581 acc.s0 += val8.s0 * w8.s0; \
582 \
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100583 ARM_DOT((uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), (uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), acc.s1); \
584 ARM_DOT((uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), (uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), acc.s1); \
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100585 acc.s1 += val8.s1 * w8.s1; \
586 \
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100587 ARM_DOT((uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), (uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), acc.s2); \
588 ARM_DOT((uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), (uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), acc.s2); \
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100589 acc.s2 += val8.s2 * w8.s2; \
590 \
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100591 ARM_DOT((uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), (uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), acc.s3); \
592 ARM_DOT((uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), (uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), acc.s3); \
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100593 acc.s3 += val8.s3 * w8.s3; \
594 })
595
596#if WEIGHTS_OFFSET != 0
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100597#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
598 ({ \
599 ARM_DOT((uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), (uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), acc.s0); \
600 ARM_DOT((uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), (uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), acc.s0); \
601 ARM_DOT((uchar4)(w8.s0, 0, 0, 0), (uchar4)val8.s0, acc.s0); \
602 \
603 ARM_DOT((uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), (uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), acc.s1); \
604 ARM_DOT((uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), (uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), acc.s1); \
605 ARM_DOT((uchar4)(w8.s1, 0, 0, 0), (uchar4)val8.s1, acc.s1); \
606 \
607 ARM_DOT((uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), (uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), acc.s2); \
608 ARM_DOT((uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), (uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), acc.s2); \
609 ARM_DOT((uchar4)(w8.s2, 0, 0, 0), (uchar4)val8.s2, acc.s2); \
610 \
611 ARM_DOT((uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), (uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), acc.s3); \
612 ARM_DOT((uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), (uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), acc.s3); \
613 ARM_DOT((uchar4)(w8.s3, 0, 0, 0), (uchar4)val8.s3, acc.s3); \
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100614 })
615#else /* WEIGHTS_OFFSET != 0 */
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100616#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100617#endif /* WEIGHTS_OFFSET != 0 */
618
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100619#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
620 ({ \
621 sum = CONVERT(val0, VEC_INT); \
622 ARM_DOT((uchar4)(val1.s0, val2.s0, val3.s0, val4.s0), (uchar4)1, sum.s0); \
623 ARM_DOT((uchar4)(val5.s0, val6.s0, val7.s0, val8.s0), (uchar4)1, sum.s0); \
624 \
625 ARM_DOT((uchar4)(val1.s1, val2.s1, val3.s1, val4.s1), (uchar4)1, sum.s1); \
626 ARM_DOT((uchar4)(val5.s1, val6.s1, val7.s1, val8.s1), (uchar4)1, sum.s1); \
627 \
628 ARM_DOT((uchar4)(val1.s2, val2.s2, val3.s2, val4.s2), (uchar4)1, sum.s2); \
629 ARM_DOT((uchar4)(val5.s2, val6.s2, val7.s2, val8.s2), (uchar4)1, sum.s2); \
630 \
631 ARM_DOT((uchar4)(val1.s3, val2.s3, val3.s3, val4.s3), (uchar4)1, sum.s3); \
632 ARM_DOT((uchar4)(val5.s3, val6.s3, val7.s3, val8.s3), (uchar4)1, sum.s3); \
633 })
634
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100635#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100636
637#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
638/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
639 *
640 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
641 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
642 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
643 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
644 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
645 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
646 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000647 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
648 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100649 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000650 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100651 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100652 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000653 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
654 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
655 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
656 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100657 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
658 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
659 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
660 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
661 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
662 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
663 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000664 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
665 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100666 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
667 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
668 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
669 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
670 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
671 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
672 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
673 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
674 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
675 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
676 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
677 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
678 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
679 * @param[in] max_offset Max offset for the input tensor
680 */
681__kernel void depthwise_convolution_3x3_quantized_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +0000682 TENSOR4D_DECLARATION(src),
683 TENSOR4D_DECLARATION(dst),
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100684 TENSOR3D_DECLARATION(weights),
685#if defined(HAS_BIAS)
686 VECTOR_DECLARATION(biases),
687#endif /* defined(HAS_BIAS) */
688 int max_offset)
689{
690 const int x = get_global_id(0); // channels
691 const int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +0000692#if defined(DST_DEPTH)
693 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
694 int b = get_global_id(2) / (int)DST_DEPTH; // batch
695#else /* defined(DST_DEPTH) */
696 int z = get_global_id(2); // spatial coordinate y
697#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100698
699 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
700
Georgios Pinitas37044642018-10-30 14:53:25 +0000701#if defined(DST_DEPTH)
702 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
703#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100704 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +0000705#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100706
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100707 int z_coord = 0;
708 int4 offset = 0;
709 int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100710
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100711 // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
712 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
713 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
714 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
715 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
716
717 int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
718
719 // We compute 4x1x1 [C,W,H] elements
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100720 VEC_INT acc = 0, sum = 0;
721
722 // Load weights
723 VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
724 VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
725 VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
726 VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
727 VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
728 VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
729 VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
730 VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
731 VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
732
733#if INPUT_OFFSET != 0
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100734 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
735 + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
736 + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100737#endif /* INPUT_OFFSET != 0 */
738
739 // Load input values
740 // z == 0
741 // Clamp z_coord as for z = 0, it can be negative
742 // z_coord is casted to unsigned int in order to use just a min() operation
743 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100744 z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100745 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
746 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100747 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100748
749 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
750 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
751 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
752
753 // z == 1
754 // z_coord can be only negative for z = 0 so we do not need to clamp it
755 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100756 z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100757 offset = y_offset + (int4)(z_coord * src_stride_z);
758 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
759 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
760 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
761
762 // z == 2
763 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
764 // However offset can be out-of-bound so we need to check if it is greater than max_offset
765 offset += (int4)src_stride_z;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100766 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100767 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
768 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
769 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
770
771 MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);
772 MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);
773 MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);
774
775 MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);
776 MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);
777 MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);
778
779 MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);
780 MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);
781 MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);
782
783#if defined(HAS_BIAS)
784 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
785 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
786 acc += bias_values;
787#endif // defined(HAS_BIAS)
788
789#if WEIGHTS_OFFSET != 0
790 acc += WEIGHTS_OFFSET * sum;
791#endif /* WEIGHTS_OFFSET != 0 */
792
793#if INPUT_OFFSET != 0
794 acc += INPUT_OFFSET * sum_we;
795#endif /* INPUT_OFFSET != 0 */
796
797#if K_OFFSET != 0
798 acc += (VEC_INT)K_OFFSET;
799#endif /* K_OFFSET != 0 */
800
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100801#if defined(REAL_MULTIPLIER)
802
803 acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
804
805#else // defined(REAL_MULTIPLIER)
806
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100807 acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100808#endif // defined(REAL_MULTIPLIER)
809
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100810 acc += (VEC_INT)OUTPUT_OFFSET;
811
812 VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR);
813 res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255);
814
Georgios Pinitas37044642018-10-30 14:53:25 +0000815#if defined(DST_DEPTH)
816 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
817#else /* defined(DST_DEPTH) */
818 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
819#endif /* defined(DST_DEPTH) */
820
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100821 VSTORE(VEC_SIZE)
Georgios Pinitas37044642018-10-30 14:53:25 +0000822 (ACTIVATION_FUNC(res), 0, dst_addr);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100823}
824#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
825
826#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
827/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1
828 *
829 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
830 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
831 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
832 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
833 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
834 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
835 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000836 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
837 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100838 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000839 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100840 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100841 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000842 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
843 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
844 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
845 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100846 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
847 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
848 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
849 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
850 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
851 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
852 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000853 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
854 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100855 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
856 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
857 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
858 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
859 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
860 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
861 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
862 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
863 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
864 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
865 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
866 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
867 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
868 * @param[in] max_offset Max offset for the input tensor
869 */
870
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000871__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +0000872 TENSOR4D_DECLARATION(src),
873 TENSOR4D_DECLARATION(dst),
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000874 TENSOR3D_DECLARATION(weights),
875#if defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100876 VECTOR_DECLARATION(biases),
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000877#endif /* defined(HAS_BIAS) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100878 int max_offset)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000879{
Giorgio Arenafa23f112018-06-19 11:27:38 +0100880 int x = get_global_id(0);
881 int y = get_global_id(1);
Georgios Pinitas37044642018-10-30 14:53:25 +0000882#if defined(DST_DEPTH)
883 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
884 int b = get_global_id(2) / (int)DST_DEPTH; // batch
885#else /* defined(DST_DEPTH) */
886 int z = get_global_id(2); // spatial coordinate y
887#endif /* defined(DST_DEPTH) */
Giorgio Arenafa23f112018-06-19 11:27:38 +0100888
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000889 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100890
Georgios Pinitas37044642018-10-30 14:53:25 +0000891#if defined(DST_DEPTH)
892 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
893#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100894 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +0000895#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100896
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100897 int z_coord = 0;
898 int4 offset = 0;
899 int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100900
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100901 // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
902 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
903 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
904 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
905 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
906
907 int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
908
909 // We compute 4x2x2 [C,W,H] elements
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100910 VEC_INT acc0 = 0, sum0 = 0;
911 VEC_INT acc1 = 0, sum1 = 0;
912 VEC_INT acc2 = 0, sum2 = 0;
913 VEC_INT acc3 = 0, sum3 = 0;
914
915 // Load weights
916 VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
917 VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
918 VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
919 VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
920 VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
921 VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
922 VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
923 VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
924 VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
925
926#if INPUT_OFFSET != 0
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100927 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
928 + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
929 + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100930#endif /* INPUT_OFFSET != 0 */
931
932 // Load input values
933 // z == 0
934 // Clamp z_coord as for z = 0, it can be negative
935 // z_coord is casted to unsigned int in order to use just a min() operation
936 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100937 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100938 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
939 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100940 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100941
942 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
943 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
944 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
945 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
946
947 // z == 1
948 // z_coord can be only negative for z = 0 so we do not need to clamp it
949 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100950 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100951 offset = y_offset + (int4)(z_coord * src_stride_z);
952 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
953 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
954 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
955 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
956
957 // z == 2
958 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
959 // However offset can be out-of-bound so we need to check if it is greater than max_offset
960 offset += (int4)src_stride_z;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100961 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100962 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
963 VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
964 VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
965 VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
966
967 // z == 3
968 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
969 // However offset can be out-of-bound so we need to check if it is greater than max_offset
970 offset += (int4)(src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100971 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100972 VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
973 VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
974 VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
975 VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
976
977 MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);
978 MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);
979 MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);
980 MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);
981 MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);
982 MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);
983
984 MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);
985 MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);
986 MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);
987 MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);
988 MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);
989 MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);
990
991 MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);
992 MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);
993 MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);
994 MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);
995 MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);
996 MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);
997
998 MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);
999 MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);
1000 MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);
1001 MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);
1002 MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);
1003 MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);
1004
1005 MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);
1006 MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);
1007 MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);
1008 MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);
1009 MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);
1010 MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);
1011
1012 MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);
1013 MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);
1014 MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);
1015 MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);
1016 MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);
1017 MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);
1018
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001019#if defined(HAS_BIAS)
1020 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1021
1022 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001023
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001024 acc0 += bias_values;
1025 acc1 += bias_values;
1026 acc2 += bias_values;
1027 acc3 += bias_values;
1028#endif /* defined(HAS_BIAS) */
1029
1030#if WEIGHTS_OFFSET != 0
1031 acc0 += WEIGHTS_OFFSET * sum0;
1032 acc1 += WEIGHTS_OFFSET * sum1;
1033 acc2 += WEIGHTS_OFFSET * sum2;
1034 acc3 += WEIGHTS_OFFSET * sum3;
1035#endif /* WEIGHTS_OFFSET != 0 */
1036
1037#if INPUT_OFFSET != 0
1038 VEC_INT offs = INPUT_OFFSET * sum_we;
1039
1040 acc0 += offs;
1041 acc1 += offs;
1042 acc2 += offs;
1043 acc3 += offs;
1044#endif /* INPUT_OFFSET != 0 */
1045
1046#if K_OFFSET != 0
1047 acc0 += (VEC_INT)K_OFFSET;
1048 acc1 += (VEC_INT)K_OFFSET;
1049 acc2 += (VEC_INT)K_OFFSET;
1050 acc3 += (VEC_INT)K_OFFSET;
1051#endif /* K_OFFSET != 0 */
1052
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001053#if defined(REAL_MULTIPLIER)
1054
1055 acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1056 acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1057 acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1058 acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1059
1060#else // defined(REAL_MULTIPLIER)
1061
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001062 acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1063 acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1064 acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1065 acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1066
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001067#endif // defined(REAL_MULTIPLIER)
1068
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001069 acc0 += (VEC_INT)OUTPUT_OFFSET;
1070 acc1 += (VEC_INT)OUTPUT_OFFSET;
1071 acc2 += (VEC_INT)OUTPUT_OFFSET;
1072 acc3 += (VEC_INT)OUTPUT_OFFSET;
1073
1074 VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
1075 VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
1076 VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
1077 VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);
1078
1079 res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
1080 res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
1081 res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
1082 res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
1083
Georgios Pinitas37044642018-10-30 14:53:25 +00001084#if defined(DST_DEPTH)
1085 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1086#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001087 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001088#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001089
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001090 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001091 (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001092 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001093 (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001094
1095#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1096 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1097#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1098 {
1099 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001100 (ACTIVATION_FUNC(res2), 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001101 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001102 (ACTIVATION_FUNC(res3), 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001103 }
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001104}
1105
Georgios Pinitasdaa38552018-08-28 17:43:18 +01001106#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001107/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product
1108 *
1109 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1110 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1111 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1112 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1113 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1114 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001115 * @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.
1116 * If not, the quantization will be performed using a fixed point multiplication
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001117 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001118 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
1119 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001120 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001121 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001122 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001123 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001124 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1125 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1126 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1127 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001128 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
1129 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1130 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1131 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1132 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1133 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1134 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001135 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1136 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001137 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1138 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
1139 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1140 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1141 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1142 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1143 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1144 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1145 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1146 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
1147 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1148 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1149 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001150 * @param[in] max_offset The maximum allowed offset for the input tensor
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001151 */
1152
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001153__kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
Georgios Pinitas37044642018-10-30 14:53:25 +00001154 TENSOR4D_DECLARATION(src),
1155 TENSOR4D_DECLARATION(dst),
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001156 TENSOR3D_DECLARATION(weights),
1157#if defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001158 VECTOR_DECLARATION(biases),
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001159#endif // defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001160 int max_offset)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001161{
Giorgio Arenafa23f112018-06-19 11:27:38 +01001162 int x = get_global_id(0);
1163 int y = get_global_id(1);
Georgios Pinitas37044642018-10-30 14:53:25 +00001164#if defined(DST_DEPTH)
1165 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1166 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1167#else /* defined(DST_DEPTH) */
1168 int z = get_global_id(2); // spatial coordinate y
1169#endif /* defined(DST_DEPTH) */
Giorgio Arenafa23f112018-06-19 11:27:38 +01001170
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001171 Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001172
Georgios Pinitas37044642018-10-30 14:53:25 +00001173#if defined(DST_DEPTH)
1174 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
1175#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001176 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001177#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001178
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001179 int z_coord = 0;
1180 int4 offset = 0;
1181 int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001182
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001183 // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
1184 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
1185 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
1186 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
1187 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
1188
1189 int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
1190
1191 // We compute 4x2x1 [C,W,H] elements
1192 VEC_INT acc0 = 0;
1193 VEC_INT acc1 = 0;
1194 VEC_INT sum0 = 0;
1195 VEC_INT sum1 = 0;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001196
1197 // Load weights
1198 VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
1199 VEC_UCHAR w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z);
1200 VEC_UCHAR w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z);
1201 VEC_UCHAR w3 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z);
1202 VEC_UCHAR w4 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z);
1203 VEC_UCHAR w5 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z);
1204 VEC_UCHAR w6 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z);
1205 VEC_UCHAR w7 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z);
1206 VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
1207
1208#if INPUT_OFFSET != 0
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001209 // Initilize the final result with the weights reduction multiplied by INPUT_OFFSET
1210 DOT_PRODUCT_REDUCTION(acc0, w0, w1, w2, w3, w4, w5, w6, w7, w8);
1211
1212 // Multiply the weights reduction with INPUT_OFFSET
1213 acc0 = INPUT_OFFSET * acc0;
1214
1215 acc1 = acc0;
1216#endif // INPUT_OFFSET != 0
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001217
1218 // Load input values
1219 // z == 0
1220 // Clamp z_coord as for z = 0, it can be negative
1221 // z_coord is casted to unsigned int in order to use just a min() operation
1222 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001223 z_coord = z - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001224 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1225 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001226 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001227
1228 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1229 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1230 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1231 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1232
1233 // z == 1
1234 // z_coord can be only negative for z = 0 so we do not need to clamp it
1235 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001236 z_coord = z - (int)CONV_PAD_TOP + 1;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001237 offset = y_offset + (int4)(z_coord * src_stride_z);
1238 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1239 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1240 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1241 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1242
1243 // z == 2
1244 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1245 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1246 offset += (int4)src_stride_z;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001247 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001248 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1249 VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1250 VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1251 VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1252
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001253 DOT_PRODUCT_REDUCTION(sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10);
1254 DOT_PRODUCT_ACCUMULATE(acc0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001255
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001256 DOT_PRODUCT_REDUCTION(sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11);
1257 DOT_PRODUCT_ACCUMULATE(acc1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001258
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001259#if defined(HAS_BIAS)
1260 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1261
1262 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001263
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001264 acc0 += bias_values;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001265 acc1 += bias_values;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001266
1267#endif // defined(HAS_BIAS)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001268
1269#if WEIGHTS_OFFSET != 0
1270 acc0 += WEIGHTS_OFFSET * sum0;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001271 acc1 += WEIGHTS_OFFSET * sum1;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001272#endif // WEIGHTS_OFFSET != 0
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001273
1274#if K_OFFSET != 0
1275 acc0 += (VEC_INT)K_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001276 acc1 += (VEC_INT)K_OFFSET;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001277
1278#endif // K_OFFSET != 0
1279
1280#if defined(REAL_MULTIPLIER)
1281
1282 acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1283 acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1284
1285#else // defined(REAL_MULTIPLIER)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001286
1287 acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001288 acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001289
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001290#endif // defined(REAL_MULTIPLIER)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001291 acc0 += (VEC_INT)OUTPUT_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001292 acc1 += (VEC_INT)OUTPUT_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001293
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001294 VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001295 VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001296
1297 res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
1298 res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001299
Georgios Pinitas37044642018-10-30 14:53:25 +00001300#if defined(DST_DEPTH)
1301 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1302#else /* defined(DST_DEPTH) */
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001303 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001304#endif /* defined(DST_DEPTH) */
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001305
1306 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001307 (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001308 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001309 (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001310}
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001311
Georgios Pinitasdaa38552018-08-28 17:43:18 +01001312#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001313
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001314#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001315
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001316#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
1317
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001318#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))