blob: 94373b74e77f6d05e394bb1e29e3baa734c29c85 [file] [log] [blame]
Dmitry Savenkod7295b72017-11-20 22:00:08 +07001/*
giuros016d109962019-01-07 17:47:19 +00002 * Copyright (c) 2017-2019 ARM Limited.
Dmitry Savenkod7295b72017-11-20 22:00:08 +07003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers_asymm.h"
26
Michele Di Giorgioa046e162019-10-08 09:36:26 +010027#ifndef VEC_SIZE
28#if defined(N0)
29#define VEC_SIZE N0
30#else /* defined(N0) */
31#define VEC_SIZE 8
32#endif /* defined(N0) */
33#endif /* VEC_SIZE */
Giorgio Arena287b5702018-02-16 11:01:04 +000034
Usama Arif6a98a6e2019-05-10 17:07:27 +010035#if defined(ACTIVATION_TYPE) && defined(CONST_0)
Giorgio Arena99ac60b2018-02-16 15:17:23 +000036#define DATA_TYPE uchar
Manuel Bottini30dbeef2019-06-26 16:23:03 +010037#include "activation_layer_quant.cl"
38#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QUANT(ACTIVATION_TYPE, x)
Usama Arif6a98a6e2019-05-10 17:07:27 +010039#else /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
Giorgio Arena99ac60b2018-02-16 15:17:23 +000040#define ACTIVATION_FUNC(x) (x)
Usama Arif6a98a6e2019-05-10 17:07:27 +010041#endif /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
Giorgio Arena99ac60b2018-02-16 15:17:23 +000042
Michele Di Giorgioa046e162019-10-08 09:36:26 +010043#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
44#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
45#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
46#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)
47#define VEC_SHORT VEC_DATA_TYPE(short, VEC_SIZE)
48
49#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
50
Georgios Pinitasdaa38552018-08-28 17:43:18 +010051#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
52#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010053#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);
Georgios Pinitasdaa38552018-08-28 17:43:18 +010054#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010055#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010056#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
57#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +010058
Georgios Pinitase55b40a2018-09-13 17:20:04 +010059#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arenadfca60b2018-01-31 10:30:59 +000060
Giorgio Arena287b5702018-02-16 11:01:04 +000061#if CONV_STRIDE_X > 3
62#error "Stride X not supported"
63#endif /* CONV_STRIDE_X > 3 */
Dmitry Savenkod7295b72017-11-20 22:00:08 +070064
Georgios Pinitasdaa38552018-08-28 17:43:18 +010065#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
Giorgio Arenaeff8d952018-07-02 15:29:57 +010066
Usama Arife73686a2019-04-08 17:30:48 +010067#if DILATION_X == 1
68
Dmitry Savenkod7295b72017-11-20 22:00:08 +070069#if CONV_STRIDE_X == 1
Giorgio Arena287b5702018-02-16 11:01:04 +000070#define GET_VALUES(first_value, left, middle, right) \
71 ({ \
72 int8 temp0 = CONVERT(vload8(0, first_value), int8); \
73 int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \
74 \
75 left = CONVERT(temp0.s01234567, int8); \
76 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \
77 right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \
78 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070079#elif CONV_STRIDE_X == 2
Giorgio Arena287b5702018-02-16 11:01:04 +000080#define GET_VALUES(first_value, left, middle, right) \
81 ({ \
82 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
83 int temp1 = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \
84 \
85 left = CONVERT(temp0.s02468ace, int8); \
86 middle = CONVERT(temp0.s13579bdf, int8); \
87 right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \
88 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070089#else /* CONV_STRIDE_X */
Giorgio Arena287b5702018-02-16 11:01:04 +000090#define GET_VALUES(first_value, left, middle, right) \
91 ({ \
92 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
93 int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \
94 \
95 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
96 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \
97 right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \
98 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070099#endif /* CONV_STRIDE_X */
100
Usama Arife73686a2019-04-08 17:30:48 +0100101#else /* DILATION_X == 1 */
102
103#if CONV_STRIDE_X == 1
104#define GET_VALUES(first_value, left, middle, right) \
105 ({ \
106 left = CONVERT(vload8(0, first_value), int8); \
107 middle = CONVERT(vload8(0, first_value + DILATION_X * sizeof(uchar)), int8); \
108 right = CONVERT(vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)), int8); \
109 })
110#elif CONV_STRIDE_X == 2
111#define GET_VALUES(first_value, left, middle, right) \
112 ({ \
113 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
114 left = CONVERT(temp0.s02468ace, int8); \
115 \
116 temp0 = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16); \
117 middle = CONVERT(temp0.s02468ace, int8); \
118 \
119 temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \
120 right = CONVERT(temp0.s02468ace, int8); \
121 })
122#else /* CONV_STRIDE_X */
123#define GET_VALUES(first_value, left, middle, right) \
124 ({ \
125 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
126 int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \
127 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
128 \
129 temp0 = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16); \
130 temp1 = CONVERT(vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar))), int8); \
131 middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
132 \
133 temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \
134 temp1 = CONVERT(vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))), int8); \
135 right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
136 })
137
138#endif /* CONV_STRIDE_X */
139#endif /* DILATION_X==1 */
140
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000141/** This function computes the depthwise convolution quantized.
Anthony Barbierf202e502017-11-23 18:02:04 +0000142 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000143 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
144 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000145 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000146 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000147 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000148 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
149 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000150 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000151 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
152 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
153 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
154 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
155 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
156 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
157 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
158 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
159 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
160 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
161 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
162 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
163 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
164 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
165 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
166 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
167 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
168 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
169 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
170 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Anthony Barbierf202e502017-11-23 18:02:04 +0000171 */
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700172
Pablo Tello47104362019-02-27 13:32:51 +0000173__kernel void dwc_3x3_native_qasymm8_nchw(
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700174 TENSOR3D_DECLARATION(src),
175 TENSOR3D_DECLARATION(dst),
Georgios Pinitas5b2191e2018-02-22 12:56:51 +0000176 TENSOR3D_DECLARATION(weights)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700177#if defined(HAS_BIAS)
Georgios Pinitas5b2191e2018-02-22 12:56:51 +0000178 ,
Giorgio Arena287b5702018-02-16 11:01:04 +0000179 VECTOR_DECLARATION(biases)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700180#endif //defined(HAS_BIAS)
Giorgio Arena287b5702018-02-16 11:01:04 +0000181)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700182{
183 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
184 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100185 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
186
187 // Extract channel and linearized batch indices
188 const int channel = get_global_id(2) % DST_CHANNELS;
189 const int batch = get_global_id(2) / DST_CHANNELS;
190
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700191#if defined(HAS_BIAS)
192 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700193
Georgios Pinitas728d3cf2018-09-21 13:41:35 +0100194 int bias_value = *((__global int *)(vector_offset(&biases, channel)));
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700195#endif //defined(HAS_BIAS)
196
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100197 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
198 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
199 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100200
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100201 uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);
202 uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);
203 uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700204
Giorgio Arena287b5702018-02-16 11:01:04 +0000205 int8 values0 = 0;
206 int8 sum0 = 0;
Usama Arife73686a2019-04-08 17:30:48 +0100207#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000208 int8 values1 = 0;
209 int8 sum1 = 0;
Usama Arife73686a2019-04-08 17:30:48 +0100210#endif /* CONV_STRIDE_Y &&DILATION_Y==1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000211
212 // Row0
213 int8 left, middle, right;
214 GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);
215 values0 += left * (int8)(w0.s0);
216 values0 += middle * (int8)(w0.s1);
217 values0 += right * (int8)(w0.s2);
218
219#if WEIGHTS_OFFSET != 0
220 sum0 += left + middle + right;
221#endif /* WEIGHTS_OFFSET != 0 */
222
223 // Row1
Usama Arife73686a2019-04-08 17:30:48 +0100224 GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left, middle, right);
Giorgio Arena287b5702018-02-16 11:01:04 +0000225 values0 += left * (int8)(w1.s0);
226 values0 += middle * (int8)(w1.s1);
227 values0 += right * (int8)(w1.s2);
Usama Arife73686a2019-04-08 17:30:48 +0100228#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000229 values1 += left * (int8)(w0.s0);
230 values1 += middle * (int8)(w0.s1);
231 values1 += right * (int8)(w0.s2);
Usama Arife73686a2019-04-08 17:30:48 +0100232#endif /* CONV_STRIDE_Y && DILATION_Y== 1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000233
234#if WEIGHTS_OFFSET != 0
235 int8 tmp = left + middle + right;
236 sum0 += tmp;
Usama Arife73686a2019-04-08 17:30:48 +0100237#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000238 sum1 += tmp;
Usama Arife73686a2019-04-08 17:30:48 +0100239#endif /* CONV_STRIDE_Y &&DILATION_Y== 1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000240#endif /* WEIGHTS_OFFSET != 0 */
241
242 // Row2
Usama Arife73686a2019-04-08 17:30:48 +0100243 GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left, middle, right);
Giorgio Arena287b5702018-02-16 11:01:04 +0000244 values0 += left * (int8)(w2.s0);
245 values0 += middle * (int8)(w2.s1);
246 values0 += right * (int8)(w2.s2);
Usama Arife73686a2019-04-08 17:30:48 +0100247#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000248 values1 += left * (int8)(w1.s0);
249 values1 += middle * (int8)(w1.s1);
250 values1 += right * (int8)(w1.s2);
Usama Arife73686a2019-04-08 17:30:48 +0100251#endif /* CONV_STRIDE_Y &&DILATION_Y == 1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000252
253#if WEIGHTS_OFFSET != 0
254 tmp = left + middle + right;
255 sum0 += tmp;
Usama Arife73686a2019-04-08 17:30:48 +0100256#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000257 sum1 += tmp;
Usama Arife73686a2019-04-08 17:30:48 +0100258#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000259#endif /* WEIGHTS_OFFSET != 0 */
260
Usama Arife73686a2019-04-08 17:30:48 +0100261#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000262 // Row3
263 GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
264 values1 += left * (int8)(w2.s0);
265 values1 += middle * (int8)(w2.s1);
266 values1 += right * (int8)(w2.s2);
267
268#if WEIGHTS_OFFSET != 0
269 sum1 += left + middle + right;
270#endif /* WEIGHTS_OFFSET != 0 */
Usama Arife73686a2019-04-08 17:30:48 +0100271#endif /* CONV_STRIDE_Y && DILATION_Y == 1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000272
273#if defined(HAS_BIAS)
274 values0 += (int8)(bias_value);
Usama Arife73686a2019-04-08 17:30:48 +0100275#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000276 values1 += (int8)(bias_value);
Usama Arife73686a2019-04-08 17:30:48 +0100277#endif /* CONV_STRIDE_Y & &DILATION_Y == 1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000278#endif //defined(HAS_BIAS)
279
280#if WEIGHTS_OFFSET != 0
281 values0 += sum0 * (int8)(WEIGHTS_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100282#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000283 values1 += sum1 * (int8)(WEIGHTS_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100284#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000285#endif /* WEIGHTS_OFFSET != 0 */
286
287#if INPUT_OFFSET != 0
288 ushort sum_weights = 0;
289 ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
290 sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
291 values0 += sum_weights * (int8)(INPUT_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100292#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000293 values1 += sum_weights * (int8)(INPUT_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100294#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000295#endif /* INPUT_OFFSET != 0 */
296
297#if K_OFFSET != 0
298 values0 += (int8)(K_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100299#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000300 values1 += (int8)(K_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100301#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
Giorgio Arena287b5702018-02-16 11:01:04 +0000302#endif /* K_OFFSET != 0 */
303
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100304#if defined(REAL_MULTIPLIER)
305
306 values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
307
308#else // defined(REAL_MULTIPLIER)
309
Pablo Tello47104362019-02-27 13:32:51 +0000310 values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100311
312#endif // defined(REAL_MULTIPLIER)
313
Giorgio Arena287b5702018-02-16 11:01:04 +0000314 values0 += (int8)OUTPUT_OFFSET;
315 uchar8 res0 = convert_uchar8_sat(values0);
316 res0 = max(res0, (uchar8)0);
317 res0 = min(res0, (uchar8)255);
318
Giorgio Arena99ac60b2018-02-16 15:17:23 +0000319 vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
Usama Arife73686a2019-04-08 17:30:48 +0100320#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100321#if defined(REAL_MULTIPLIER)
322
323 values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
324
325#else // defined(REAL_MULTIPLIER)
Giorgio Arena287b5702018-02-16 11:01:04 +0000326
Pablo Tello47104362019-02-27 13:32:51 +0000327 values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100328
329#endif // defined(REAL_MULTIPLIER)
330
Giorgio Arena287b5702018-02-16 11:01:04 +0000331 values1 += (int8)OUTPUT_OFFSET;
332 uchar8 res1 = convert_uchar8_sat(values1);
333 res1 = max(res1, (uchar8)0);
334 res1 = min(res1, (uchar8)255);
335
Giorgio Arena99ac60b2018-02-16 15:17:23 +0000336 vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +0100337#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700338}
Giorgio Arena287b5702018-02-16 11:01:04 +0000339
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100340#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
Usama Arife73686a2019-04-08 17:30:48 +0100341#if DILATION_X == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100342#if CONV_STRIDE_X == 1
343#define GET_VALUES(first_value, left, middle, right) \
344 ({ \
345 uchar8 temp0 = vload8(0, first_value); \
346 uchar2 temp1 = vload2(0, (first_value + 8 * sizeof(uchar))); \
347 \
348 left = temp0.s01234567; \
349 middle = (uchar8)(temp0.s1234, temp0.s567, temp1.s0); \
350 right = (uchar8)(temp0.s2345, temp0.s67, temp1.s01); \
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000351 })
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100352#elif CONV_STRIDE_X == 2
353#define GET_VALUES(first_value, left, middle, right) \
354 ({ \
355 uchar16 temp0 = vload16(0, first_value); \
356 uchar temp1 = *(first_value + 16 * sizeof(uchar)); \
357 \
358 left = temp0.s02468ace; \
359 middle = temp0.s13579bdf; \
360 right = (uchar8)(temp0.s2468, temp0.sace, temp1); \
361 })
362#else /* CONV_STRIDE_X */
363#define GET_VALUES(first_value, left, middle, right) \
364 ({ \
365 uchar16 temp0 = vload16(0, first_value); \
366 uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \
367 \
368 left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
369 middle = (uchar8)(temp0.s147a, temp0.sd, temp1.s036); \
370 right = (uchar8)(temp0.s258b, temp0.se, temp1.s147); \
371 })
372#endif /* CONV_STRIDE_X */
Usama Arife73686a2019-04-08 17:30:48 +0100373#else /*DILATION_X==1*/
374
375#if CONV_STRIDE_X == 1
376#define GET_VALUES(first_value, left, middle, right) \
377 ({ \
378 left = vload8(0, first_value); \
379 middle = vload8(0, first_value + DILATION_X * sizeof(uchar)); \
380 right = vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
381 })
382#elif CONV_STRIDE_X == 2
383#define GET_VALUES(first_value, left, middle, right) \
384 ({ \
385 uchar16 temp0 = vload16(0, first_value); \
386 left = temp0.s02468ace; \
387 temp0 = vload16(0, first_value + DILATION_X * sizeof(uchar)); \
388 middle = temp0.s02468ace; \
389 temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
390 right = temp0.s02468ace; \
391 })
392#else /* CONV_STRIDE_X */
393#define GET_VALUES(first_value, left, middle, right) \
394 ({ \
395 uchar16 temp0 = vload16(0, first_value); \
396 uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \
397 left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
398 \
399 temp0 = vload16(0, first_value + DILATION_X * sizeof(uchar)); \
400 temp1 = vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar))); \
401 middle = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
402 \
403 temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
404 temp1 = vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))); \
405 right = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
406 })
407
408#endif /* CONV_STRIDE_X */
409#endif /*DILATION_X==1*/
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100410/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000411 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000412 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
413 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000414 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000415 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000416 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000417 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
418 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000419 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000420 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
421 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
422 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
423 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
424 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
425 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
426 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
427 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
428 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
429 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
430 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
431 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
432 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
433 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
434 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
435 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
436 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
437 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
438 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
439 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
440 */
441
Pablo Tello47104362019-02-27 13:32:51 +0000442__kernel void dwc_3x3_native_qasymm8_dot8_nchw(
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100443 TENSOR3D_DECLARATION(src),
444 TENSOR3D_DECLARATION(dst),
445 TENSOR3D_DECLARATION(weights)
446#if defined(HAS_BIAS)
447 ,
448 VECTOR_DECLARATION(biases)
449#endif //defined(HAS_BIAS)
450)
451{
452 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
453 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100454 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100455
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100456 // Extract channel and linearized batch indices
457 const int channel = get_global_id(2) % DST_CHANNELS;
458 const int batch = get_global_id(2) / DST_CHANNELS;
459
460#if defined(HAS_BIAS)
461 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
462
463 const int bias_value = *((__global int *)(vector_offset(&biases, channel)));
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100464#endif //defined(HAS_BIAS)
465
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100466 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
467 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
468 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100469
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100470 uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);
471 uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);
472 uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100473
474 uchar8 left0, middle0, right0;
475 uchar8 left1, middle1, right1;
476 uchar8 left2, middle2, right2;
477
478 int8 values0 = 0;
479 int8 sum0 = 0;
480
481 GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);
Usama Arife73686a2019-04-08 17:30:48 +0100482 GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left1, middle1, right1);
483 GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100484
485#if WEIGHTS_OFFSET != 0
486 sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
487 sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
488 sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
489#endif /* WEIGHTS_OFFSET != 0 */
490
Usama Arife73686a2019-04-08 17:30:48 +0100491#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100492 // If conv_stride_y is equals to 1, we compute two output rows
493
494 uchar8 left3, middle3, right3;
495 int8 values1 = 0;
496 int8 sum1 = 0;
497
498 GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);
499
500#if WEIGHTS_OFFSET != 0
501 sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
502 sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
503 sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
504#endif /* WEIGHTS_OFFSET != 0 */
Usama Arife73686a2019-04-08 17:30:48 +0100505#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100506
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100507 ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
508 ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100509 values0.s0 += right2.s0 * w2.s2;
510
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100511 ARM_DOT((uchar4)(left0.s1, middle0.s1, right0.s1, left1.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);
512 ARM_DOT((uchar4)(middle1.s1, right1.s1, left2.s1, middle2.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100513 values0.s1 += right2.s1 * w2.s2;
514
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100515 ARM_DOT((uchar4)(left0.s2, middle0.s2, right0.s2, left1.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);
516 ARM_DOT((uchar4)(middle1.s2, right1.s2, left2.s2, middle2.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100517 values0.s2 += right2.s2 * w2.s2;
518
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100519 ARM_DOT((uchar4)(left0.s3, middle0.s3, right0.s3, left1.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);
520 ARM_DOT((uchar4)(middle1.s3, right1.s3, left2.s3, middle2.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100521 values0.s3 += right2.s3 * w2.s2;
522
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100523 ARM_DOT((uchar4)(left0.s4, middle0.s4, right0.s4, left1.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);
524 ARM_DOT((uchar4)(middle1.s4, right1.s4, left2.s4, middle2.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100525 values0.s4 += right2.s4 * w2.s2;
526
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100527 ARM_DOT((uchar4)(left0.s5, middle0.s5, right0.s5, left1.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);
528 ARM_DOT((uchar4)(middle1.s5, right1.s5, left2.s5, middle2.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100529 values0.s5 += right2.s5 * w2.s2;
530
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100531 ARM_DOT((uchar4)(left0.s6, middle0.s6, right0.s6, left1.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);
532 ARM_DOT((uchar4)(middle1.s6, right1.s6, left2.s6, middle2.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100533 values0.s6 += right2.s6 * w2.s2;
534
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100535 ARM_DOT((uchar4)(left0.s7, middle0.s7, right0.s7, left1.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);
536 ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100537 values0.s7 += right2.s7 * w2.s2;
538
Usama Arife73686a2019-04-08 17:30:48 +0100539#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100540 ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
541 ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100542 values1.s0 += right3.s0 * w2.s2;
543
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100544 ARM_DOT((uchar4)(left1.s1, middle1.s1, right1.s1, left2.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);
545 ARM_DOT((uchar4)(middle2.s1, right2.s1, left3.s1, middle3.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100546 values1.s1 += right3.s1 * w2.s2;
547
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100548 ARM_DOT((uchar4)(left1.s2, middle1.s2, right1.s2, left2.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);
549 ARM_DOT((uchar4)(middle2.s2, right2.s2, left3.s2, middle3.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100550 values1.s2 += right3.s2 * w2.s2;
551
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100552 ARM_DOT((uchar4)(left1.s3, middle1.s3, right1.s3, left2.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);
553 ARM_DOT((uchar4)(middle2.s3, right2.s3, left3.s3, middle3.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100554 values1.s3 += right3.s3 * w2.s2;
555
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100556 ARM_DOT((uchar4)(left1.s4, middle1.s4, right1.s4, left2.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);
557 ARM_DOT((uchar4)(middle2.s4, right2.s4, left3.s4, middle3.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100558 values1.s4 += right3.s4 * w2.s2;
559
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100560 ARM_DOT((uchar4)(left1.s5, middle1.s5, right1.s5, left2.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);
561 ARM_DOT((uchar4)(middle2.s5, right2.s5, left3.s5, middle3.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100562 values1.s5 += right3.s5 * w2.s2;
563
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100564 ARM_DOT((uchar4)(left1.s6, middle1.s6, right1.s6, left2.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);
565 ARM_DOT((uchar4)(middle2.s6, right2.s6, left3.s6, middle3.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100566 values1.s6 += right3.s6 * w2.s2;
567
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100568 ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
569 ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100570 values1.s7 += right3.s7 * w2.s2;
Usama Arife73686a2019-04-08 17:30:48 +0100571#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100572
573#if defined(HAS_BIAS)
574 values0 += (int8)(bias_value);
Usama Arife73686a2019-04-08 17:30:48 +0100575#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100576 values1 += (int8)(bias_value);
Usama Arife73686a2019-04-08 17:30:48 +0100577#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100578#endif //defined(HAS_BIAS)
579
580#if WEIGHTS_OFFSET != 0
581 values0 += sum0 * (int8)(WEIGHTS_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100582#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100583 values1 += sum1 * (int8)(WEIGHTS_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100584#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100585#endif /* WEIGHTS_OFFSET != 0 */
586
587#if INPUT_OFFSET != 0
588 ushort sum_weights = 0;
589 ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
590 sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
591 values0 += sum_weights * (int8)(INPUT_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100592#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100593 values1 += sum_weights * (int8)(INPUT_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100594#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100595#endif /* INPUT_OFFSET != 0 */
596
597#if K_OFFSET != 0
598 values0 += (int8)(K_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100599#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100600 values1 += (int8)(K_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100601#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100602#endif /* K_OFFSET != 0 */
603
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100604#if defined(REAL_MULTIPLIER)
605
606 values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
607
608#else // defined(REAL_MULTIPLIER)
609
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100610 values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100611
612#endif // defined(REAL_MULTIPLIER)
613
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100614 values0 += (int8)OUTPUT_OFFSET;
615 uchar8 res0 = convert_uchar8_sat(values0);
616 res0 = max(res0, (uchar8)0);
617 res0 = min(res0, (uchar8)255);
618
619 vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
Usama Arife73686a2019-04-08 17:30:48 +0100620#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100621
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100622#if defined(REAL_MULTIPLIER)
623
624 values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
625
626#else // defined(REAL_MULTIPLIER)
627
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100628 values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100629
630#endif // defined(REAL_MULTIPLIER)
631
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100632 values1 += (int8)OUTPUT_OFFSET;
633 uchar8 res1 = convert_uchar8_sat(values1);
634 res1 = max(res1, (uchar8)0);
635 res1 = min(res1, (uchar8)255);
636
637 vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +0100638#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100639}
640
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100641#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100642
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100643#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100644
645#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
646
647#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
648
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100649#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT)
650
651#if WEIGHTS_OFFSET != 0
652#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \
653 ({ \
654 sum += CONVERT(x, VEC_INT); \
655 MULTIPLY_ADD(x, y, acc); \
656 })
657#else /* WEIGHTS_OFFSET != 0 */
658#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
659#endif /* WEIGHTS_OFFSET != 0 */
660
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100661#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
giuros016d109962019-01-07 17:47:19 +0000662#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \
663 ({ \
664 ARM_DOT((uchar4)(val0, val1, val2, val3), w0.s0123, acc); \
665 ARM_DOT((uchar4)(val4, val5, val6, val7), w0.s4567, acc); \
666 acc += val8 * w1; \
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100667 })
668
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100669#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
670 ({ \
giuros016d109962019-01-07 17:47:19 +0000671 sum = val0; \
672 ARM_DOT((uchar4)(val1, val2, val3, val4), (uchar4)1, sum); \
673 ARM_DOT((uchar4)(val5, val6, val7, val8), (uchar4)1, sum); \
674 })
675
676#define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \
677 ({ \
678 sum = w1; \
679 ARM_DOT(w0.s0123, (uchar4)1, sum); \
680 ARM_DOT(w0.s4567, (uchar4)1, sum); \
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100681 })
682
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100683#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100684
Pablo Tello47104362019-02-27 13:32:51 +0000685#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100686/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
687 *
Pablo Tello47104362019-02-27 13:32:51 +0000688 * @note This kernel assumes VEC_SIZE is 4.
giuros016d109962019-01-07 17:47:19 +0000689 * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100690 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
691 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
692 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
693 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
694 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
695 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
696 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000697 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
698 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100699 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000700 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100701 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100702 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000703 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
704 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
705 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
706 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100707 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
708 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
709 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
710 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
711 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
712 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
713 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000714 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
715 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100716 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
giuros016d109962019-01-07 17:47:19 +0000717 * @param[in] weights_ptr Pointer to the weights tensor reshaped. Supported data types: same as @p src_ptr
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100718 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
719 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
720 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
721 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100722 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
723 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
724 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
725 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
726 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
727 * @param[in] max_offset Max offset for the input tensor
728 */
Pablo Tello47104362019-02-27 13:32:51 +0000729__kernel void dwc_3x3_reshaped_qasymm8_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +0000730 TENSOR4D_DECLARATION(src),
731 TENSOR4D_DECLARATION(dst),
giuros016d109962019-01-07 17:47:19 +0000732 IMAGE_DECLARATION(weights),
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100733#if defined(HAS_BIAS)
734 VECTOR_DECLARATION(biases),
735#endif /* defined(HAS_BIAS) */
736 int max_offset)
737{
738 const int x = get_global_id(0); // channels
739 const int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +0000740#if defined(DST_DEPTH)
741 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
742 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Pablo Tello47104362019-02-27 13:32:51 +0000743#else // defined(DST_DEPTH)
744 int z = get_global_id(2); // spatial coordinate y
745#endif // defined(DST_DEPTH)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100746
giuros016d109962019-01-07 17:47:19 +0000747 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100748
Georgios Pinitas37044642018-10-30 14:53:25 +0000749#if defined(DST_DEPTH)
750 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
751#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100752 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +0000753#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100754
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100755 int z_coord = 0;
756 int4 offset = 0;
Usama Arife73686a2019-04-08 17:30:48 +0100757 int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3)) - (int)CONV_PAD_LEFT;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100758
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100759 // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
760 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
761 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
762 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
763 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
764
765 int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
766
giuros016d109962019-01-07 17:47:19 +0000767 // We compute VEC_SIZEx1x1 [C,W,H] elements
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100768 VEC_INT acc = 0, sum = 0;
769
770 // Load weights
Pablo Tello47104362019-02-27 13:32:51 +0000771 uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
772 uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
773 uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16);
774
775 uchar4 w0 = w0_tmp.s0123;
776 uchar4 w1 = w0_tmp.s4567;
777 uchar4 w2 = w0_tmp.s89AB;
778 uchar4 w3 = w0_tmp.sCDEF;
779
780 uchar4 w4 = w1_tmp.s0123;
781 uchar4 w5 = w1_tmp.s4567;
782 uchar4 w6 = w1_tmp.s89AB;
783 uchar4 w7 = w1_tmp.sCDEF;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100784
785#if INPUT_OFFSET != 0
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100786 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
787 + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
788 + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100789#endif /* INPUT_OFFSET != 0 */
790
791 // Load input values
792 // z == 0
793 // Clamp z_coord as for z = 0, it can be negative
794 // z_coord is casted to unsigned int in order to use just a min() operation
795 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100796 z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100797 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
798 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100799 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100800
801 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
802 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
803 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
804
805 // z == 1
806 // z_coord can be only negative for z = 0 so we do not need to clamp it
807 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Usama Arife73686a2019-04-08 17:30:48 +0100808 z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100809 offset = y_offset + (int4)(z_coord * src_stride_z);
810 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
811 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
812 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
813
814 // z == 2
Usama Arife73686a2019-04-08 17:30:48 +0100815 // Offset can be out-of-bound so we need to check if it is greater than max_offset
816 z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
817 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100818 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100819 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
820 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
821 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
822
823 MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);
824 MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);
825 MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);
826
827 MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);
828 MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);
829 MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);
830
831 MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);
832 MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);
833 MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);
834
835#if defined(HAS_BIAS)
836 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
837 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
838 acc += bias_values;
839#endif // defined(HAS_BIAS)
840
841#if WEIGHTS_OFFSET != 0
842 acc += WEIGHTS_OFFSET * sum;
843#endif /* WEIGHTS_OFFSET != 0 */
844
845#if INPUT_OFFSET != 0
846 acc += INPUT_OFFSET * sum_we;
847#endif /* INPUT_OFFSET != 0 */
848
849#if K_OFFSET != 0
850 acc += (VEC_INT)K_OFFSET;
851#endif /* K_OFFSET != 0 */
852
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100853#if defined(REAL_MULTIPLIER)
854
855 acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
856
857#else // defined(REAL_MULTIPLIER)
858
Pablo Tello47104362019-02-27 13:32:51 +0000859 acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100860#endif // defined(REAL_MULTIPLIER)
861
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100862 acc += (VEC_INT)OUTPUT_OFFSET;
863
864 VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR);
865 res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255);
866
Georgios Pinitas37044642018-10-30 14:53:25 +0000867#if defined(DST_DEPTH)
868 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
869#else /* defined(DST_DEPTH) */
870 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
871#endif /* defined(DST_DEPTH) */
872
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100873 VSTORE(VEC_SIZE)
Georgios Pinitas37044642018-10-30 14:53:25 +0000874 (ACTIVATION_FUNC(res), 0, dst_addr);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100875}
876#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
877
Pablo Tello47104362019-02-27 13:32:51 +0000878#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4
giuros016d109962019-01-07 17:47:19 +0000879/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100880 *
Pablo Tello47104362019-02-27 13:32:51 +0000881 * @note This kernel assumes VEC_SIZE is 4.
giuros016d109962019-01-07 17:47:19 +0000882 * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100883 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
884 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
885 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
886 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
887 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
888 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
889 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000890 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
891 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100892 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000893 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100894 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100895 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000896 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
897 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
898 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
899 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100900 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
901 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
902 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
903 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
904 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
905 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
906 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000907 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
908 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100909 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
910 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
911 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
912 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
913 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
914 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100915 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
916 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
917 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
918 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
919 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
920 * @param[in] max_offset Max offset for the input tensor
921 */
922
Pablo Tello47104362019-02-27 13:32:51 +0000923__kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +0000924 TENSOR4D_DECLARATION(src),
925 TENSOR4D_DECLARATION(dst),
giuros016d109962019-01-07 17:47:19 +0000926 IMAGE_DECLARATION(weights),
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000927#if defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100928 VECTOR_DECLARATION(biases),
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000929#endif /* defined(HAS_BIAS) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100930 int max_offset)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000931{
Giorgio Arenafa23f112018-06-19 11:27:38 +0100932 int x = get_global_id(0);
933 int y = get_global_id(1);
Georgios Pinitas37044642018-10-30 14:53:25 +0000934#if defined(DST_DEPTH)
935 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
936 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Pablo Tello47104362019-02-27 13:32:51 +0000937#else // defined(DST_DEPTH)
938 int z = get_global_id(2); // spatial coordinate y
939#endif // defined(DST_DEPTH)
Giorgio Arenafa23f112018-06-19 11:27:38 +0100940
giuros016d109962019-01-07 17:47:19 +0000941 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100942
Georgios Pinitas37044642018-10-30 14:53:25 +0000943#if defined(DST_DEPTH)
944 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
945#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100946 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +0000947#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100948
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100949 int z_coord = 0;
950 int4 offset = 0;
951 int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100952
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100953 // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
954 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
955 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
956 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
957 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
958
959 int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
960
961 // We compute 4x2x2 [C,W,H] elements
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100962 VEC_INT acc0 = 0, sum0 = 0;
963 VEC_INT acc1 = 0, sum1 = 0;
964 VEC_INT acc2 = 0, sum2 = 0;
965 VEC_INT acc3 = 0, sum3 = 0;
966
967 // Load weights
Pablo Tello47104362019-02-27 13:32:51 +0000968 uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
969 uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
970 uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16);
971
972 uchar4 w0 = w0_tmp.s0123;
973 uchar4 w1 = w0_tmp.s4567;
974 uchar4 w2 = w0_tmp.s89AB;
975 uchar4 w3 = w0_tmp.sCDEF;
976
977 uchar4 w4 = w1_tmp.s0123;
978 uchar4 w5 = w1_tmp.s4567;
979 uchar4 w6 = w1_tmp.s89AB;
980 uchar4 w7 = w1_tmp.sCDEF;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100981
982#if INPUT_OFFSET != 0
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100983 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
984 + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
985 + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100986#endif /* INPUT_OFFSET != 0 */
987
988 // Load input values
989 // z == 0
990 // Clamp z_coord as for z = 0, it can be negative
991 // z_coord is casted to unsigned int in order to use just a min() operation
992 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100993 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100994 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
995 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100996 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100997
998 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
999 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1000 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1001 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1002
1003 // z == 1
1004 // z_coord can be only negative for z = 0 so we do not need to clamp it
1005 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001006 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001007 offset = y_offset + (int4)(z_coord * src_stride_z);
1008 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1009 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1010 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1011 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1012
1013 // z == 2
1014 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1015 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1016 offset += (int4)src_stride_z;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001017 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001018 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1019 VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1020 VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1021 VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1022
1023 // z == 3
1024 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1025 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1026 offset += (int4)(src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001027 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001028 VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1029 VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1030 VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1031 VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1032
1033 MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);
1034 MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);
1035 MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);
1036 MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);
1037 MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);
1038 MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);
1039
1040 MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);
1041 MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);
1042 MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);
1043 MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);
1044 MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);
1045 MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);
1046
1047 MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);
1048 MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);
1049 MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);
1050 MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);
1051 MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);
1052 MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);
1053
1054 MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);
1055 MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);
1056 MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);
1057 MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);
1058 MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);
1059 MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);
1060
1061 MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);
1062 MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);
1063 MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);
1064 MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);
1065 MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);
1066 MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);
1067
1068 MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);
1069 MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);
1070 MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);
1071 MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);
1072 MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);
1073 MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);
1074
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001075#if defined(HAS_BIAS)
1076 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1077
1078 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001079
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001080 acc0 += bias_values;
1081 acc1 += bias_values;
1082 acc2 += bias_values;
1083 acc3 += bias_values;
1084#endif /* defined(HAS_BIAS) */
1085
1086#if WEIGHTS_OFFSET != 0
1087 acc0 += WEIGHTS_OFFSET * sum0;
1088 acc1 += WEIGHTS_OFFSET * sum1;
1089 acc2 += WEIGHTS_OFFSET * sum2;
1090 acc3 += WEIGHTS_OFFSET * sum3;
1091#endif /* WEIGHTS_OFFSET != 0 */
1092
1093#if INPUT_OFFSET != 0
1094 VEC_INT offs = INPUT_OFFSET * sum_we;
1095
1096 acc0 += offs;
1097 acc1 += offs;
1098 acc2 += offs;
1099 acc3 += offs;
1100#endif /* INPUT_OFFSET != 0 */
1101
1102#if K_OFFSET != 0
1103 acc0 += (VEC_INT)K_OFFSET;
1104 acc1 += (VEC_INT)K_OFFSET;
1105 acc2 += (VEC_INT)K_OFFSET;
1106 acc3 += (VEC_INT)K_OFFSET;
1107#endif /* K_OFFSET != 0 */
1108
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001109#if defined(REAL_MULTIPLIER)
1110
1111 acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1112 acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1113 acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1114 acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1115
1116#else // defined(REAL_MULTIPLIER)
1117
Pablo Tello47104362019-02-27 13:32:51 +00001118 acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1119 acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1120 acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1121 acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001122
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001123#endif // defined(REAL_MULTIPLIER)
1124
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001125 acc0 += (VEC_INT)OUTPUT_OFFSET;
1126 acc1 += (VEC_INT)OUTPUT_OFFSET;
1127 acc2 += (VEC_INT)OUTPUT_OFFSET;
1128 acc3 += (VEC_INT)OUTPUT_OFFSET;
1129
1130 VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
1131 VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
1132 VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
1133 VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);
1134
1135 res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
1136 res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
1137 res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
1138 res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
1139
Georgios Pinitas37044642018-10-30 14:53:25 +00001140#if defined(DST_DEPTH)
1141 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1142#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001143 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001144#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001145
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001146 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001147 (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001148 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001149 (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001150
1151#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1152 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1153#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1154 {
1155 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001156 (ACTIVATION_FUNC(res2), 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001157 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001158 (ACTIVATION_FUNC(res3), 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001159 }
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001160}
1161
giuros016d109962019-01-07 17:47:19 +00001162#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4
1163/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product.
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001164 *
giuros016d109962019-01-07 17:47:19 +00001165 * @note This kernel assumes VEC_SIZE is 4.
1166 * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001167 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1168 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1169 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1170 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1171 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1172 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001173 * @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.
1174 * If not, the quantization will be performed using a fixed point multiplication
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001175 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001176 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
1177 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001178 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001179 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001180 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001181 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001182 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1183 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1184 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1185 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001186 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
1187 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1188 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1189 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1190 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1191 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1192 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001193 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1194 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001195 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1196 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
1197 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1198 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1199 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1200 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001201 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1202 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
1203 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1204 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1205 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001206 * @param[in] max_offset The maximum allowed offset for the input tensor
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001207 */
Pablo Tello47104362019-02-27 13:32:51 +00001208__kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001209 TENSOR4D_DECLARATION(src),
1210 TENSOR4D_DECLARATION(dst),
giuros016d109962019-01-07 17:47:19 +00001211 IMAGE_DECLARATION(weights),
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001212#if defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001213 VECTOR_DECLARATION(biases),
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001214#endif // defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001215 int max_offset)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001216{
Giorgio Arenafa23f112018-06-19 11:27:38 +01001217 int x = get_global_id(0);
1218 int y = get_global_id(1);
Georgios Pinitas37044642018-10-30 14:53:25 +00001219#if defined(DST_DEPTH)
1220 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1221 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Pablo Tello47104362019-02-27 13:32:51 +00001222#else // defined(DST_DEPTH)
1223 int z = get_global_id(2); // spatial coordinate y
1224#endif // defined(DST_DEPTH)
Giorgio Arenafa23f112018-06-19 11:27:38 +01001225
giuros016d109962019-01-07 17:47:19 +00001226 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001227
Georgios Pinitas37044642018-10-30 14:53:25 +00001228#if defined(DST_DEPTH)
1229 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
1230#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001231 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001232#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001233
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001234 int z_coord = 0;
1235 int4 offset = 0;
1236 int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001237
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001238 // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
1239 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
1240 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
1241 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
1242 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
1243
1244 int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
1245
1246 // We compute 4x2x1 [C,W,H] elements
1247 VEC_INT acc0 = 0;
1248 VEC_INT acc1 = 0;
1249 VEC_INT sum0 = 0;
1250 VEC_INT sum1 = 0;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001251
1252 // Load weights
giuros016d109962019-01-07 17:47:19 +00001253 uchar16 w0 = VLOAD(16)(0, weights_addr);
1254 uchar16 w1 = VLOAD(16)(0, weights_addr + 16);
1255 uchar4 w2 = VLOAD(4)(0, weights_addr + 32);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001256
1257#if INPUT_OFFSET != 0
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001258 // Initilize the final result with the weights reduction multiplied by INPUT_OFFSET
giuros016d109962019-01-07 17:47:19 +00001259 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8);
1260 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
1261 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA);
1262 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001263
1264 // Multiply the weights reduction with INPUT_OFFSET
1265 acc0 = INPUT_OFFSET * acc0;
1266
1267 acc1 = acc0;
1268#endif // INPUT_OFFSET != 0
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001269
1270 // Load input values
1271 // z == 0
1272 // Clamp z_coord as for z = 0, it can be negative
1273 // z_coord is casted to unsigned int in order to use just a min() operation
1274 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001275 z_coord = z - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001276 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1277 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001278 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001279
1280 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1281 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1282 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1283 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1284
1285 // z == 1
1286 // z_coord can be only negative for z = 0 so we do not need to clamp it
1287 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001288 z_coord = z - (int)CONV_PAD_TOP + 1;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001289 offset = y_offset + (int4)(z_coord * src_stride_z);
1290 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1291 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1292 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1293 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1294
1295 // z == 2
1296 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1297 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1298 offset += (int4)src_stride_z;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001299 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001300 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1301 VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1302 VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1303 VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1304
giuros016d109962019-01-07 17:47:19 +00001305 DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0);
1306 DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0);
1307 DOT_PRODUCT(acc0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0, w0.s01234567, w0.s8);
1308 DOT_PRODUCT(acc1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0, w0.s01234567, w0.s8);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001309
giuros016d109962019-01-07 17:47:19 +00001310 DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1);
1311 DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1);
1312 DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
1313 DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
1314
1315 DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2);
1316 DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2);
1317 DOT_PRODUCT(acc0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2, w1.s23456789, w1.sA);
1318 DOT_PRODUCT(acc1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2, w1.s23456789, w1.sA);
1319
1320 DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3);
1321 DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3);
1322 DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
1323 DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001324
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001325#if defined(HAS_BIAS)
1326 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1327
1328 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001329
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001330 acc0 += bias_values;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001331 acc1 += bias_values;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001332
1333#endif // defined(HAS_BIAS)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001334
1335#if WEIGHTS_OFFSET != 0
1336 acc0 += WEIGHTS_OFFSET * sum0;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001337 acc1 += WEIGHTS_OFFSET * sum1;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001338#endif // WEIGHTS_OFFSET != 0
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001339
1340#if K_OFFSET != 0
1341 acc0 += (VEC_INT)K_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001342 acc1 += (VEC_INT)K_OFFSET;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001343
1344#endif // K_OFFSET != 0
1345
1346#if defined(REAL_MULTIPLIER)
1347
1348 acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1349 acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1350
1351#else // defined(REAL_MULTIPLIER)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001352
Pablo Tello47104362019-02-27 13:32:51 +00001353 acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1354 acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001355
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001356#endif // defined(REAL_MULTIPLIER)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001357 acc0 += (VEC_INT)OUTPUT_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001358 acc1 += (VEC_INT)OUTPUT_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001359
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001360 VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001361 VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001362
1363 res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
1364 res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001365
Georgios Pinitas37044642018-10-30 14:53:25 +00001366#if defined(DST_DEPTH)
1367 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1368#else /* defined(DST_DEPTH) */
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001369 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001370#endif /* defined(DST_DEPTH) */
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001371
1372 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001373 (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001374 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001375 (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001376}
giuros016d109962019-01-07 17:47:19 +00001377#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001378
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001379#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001380
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001381#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
1382
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001383#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001384
1385#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
1386/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
1387 *
1388 * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
1389 * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
1390 * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
1391 * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
1392 * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
1393 * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
1394 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1395 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
1396 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
1397 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
1398 * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
1399 * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
1400 *
1401 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
1402 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1403 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1404 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1405 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1406 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1407 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1408 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1409 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1410 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1411 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr
1412 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1413 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1414 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1415 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1416 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1417 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
1418 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1419 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
1420 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1421 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
1422 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1423 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1424 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1425 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
1426 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
1427 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
1428 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1429 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr
1430 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1431 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1432 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
1433 */
1434__kernel void dwc_MxN_native_quantized8_nhwc(
1435 TENSOR4D_DECLARATION(src),
1436 TENSOR4D_DECLARATION(dst),
Michele Di Giorgio1dce3102019-10-22 10:29:03 +01001437 TENSOR3D_DECLARATION(weights)
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001438#if defined(HAS_BIAS)
Michele Di Giorgio1dce3102019-10-22 10:29:03 +01001439 ,
Michele Di Giorgioa046e162019-10-08 09:36:26 +01001440 VECTOR_DECLARATION(biases)
1441#endif // defined(HAS_BIAS)
1442)
1443{
1444 int x = get_global_id(0); // channels
1445 int y = get_global_id(1); // spatial coordinate x
1446#if defined(DST_DEPTH)
1447 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1448 int b = get_global_id(2) / (int)DST_DEPTH; // batch
1449#else // defined(DST_DEPTH)
1450 int z = get_global_id(2); // spatial coordinate y
1451#endif // defined(DST_DEPTH)
1452
1453 __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(uchar) * (int)N0;
1454
1455 __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(uchar) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z;
1456
1457 __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(uchar) * (int)DEPTH_MULTIPLIER * (int)N0;
1458
1459#if defined(HAS_BIAS)
1460 __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
1461#endif // defined(HAS_BIAS)
1462
1463#if defined(DST_DEPTH)
1464 s_addr += b * src_stride_w;
1465 d_addr += b * dst_stride_w;
1466#endif // defined(DST_DEPTH)
1467
1468#if DEPTH_MULTIPLIER > 1
1469 for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
1470 {
1471#endif // DEPTH_MULTIPLIER > 1
1472 // Each work-item computes N0x1x1 elements
1473 VEC_SHORT res = 0;
1474
1475 int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
1476 int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1477
1478 for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
1479 {
1480 if(y_coord >= 0 && y_coord < SRC_DIM2)
1481 {
1482 int x_coord_tmp = x_coord;
1483
1484 for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
1485 {
1486 if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
1487 {
1488 int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
1489 int w_offset = xk * weights_stride_y + yk * weights_stride_z;
1490
1491 // Load input and weights values
1492 VEC_SHORT i = CONVERT(VLOAD(N0)(0, (__global uchar *)(s_addr + s_offset)), VEC_SHORT);
1493 VEC_SHORT w = CONVERT(VLOAD(N0)(0, (__global uchar *)(w_addr + w_offset)), VEC_SHORT);
1494
1495 res += (i + (VEC_SHORT)INPUT_OFFSET) * (w + (VEC_SHORT)WEIGHTS_OFFSET);
1496 }
1497 x_coord_tmp += DILATION_X;
1498 }
1499 }
1500 y_coord += DILATION_Y;
1501 }
1502
1503#if defined(HAS_BIAS)
1504 VEC_SHORT bias = CONVERT(VLOAD(N0)(0, (__global int *)(b_addr)), VEC_SHORT);
1505 res += bias;
1506#endif // defined(HAS_BIAS)
1507
1508 res = CONVERT(ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(CONVERT(res, VEC_INT), OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0), VEC_SHORT);
1509 res += (VEC_SHORT)OUTPUT_OFFSET;
1510
1511 VEC_UCHAR res1 = CONVERT_SAT(res, VEC_UCHAR);
1512
1513 VSTORE(N0)
1514 (ACTIVATION_FUNC(res1), 0, (__global uchar *)(d_addr));
1515
1516#if DEPTH_MULTIPLIER > 1
1517 w_addr += sizeof(uchar);
1518 d_addr += sizeof(uchar);
1519#if defined(HAS_BIAS)
1520 b_addr += sizeof(int);
1521#endif // defined(HAS_BIAS)
1522 }
1523#endif // DEPTH_MULTIPLIER > 1
1524}
1525#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)