blob: 8f2e441693a09757a67d90f93e5c97af6bfb7444 [file] [log] [blame]
Dmitry Savenkod7295b72017-11-20 22:00:08 +07001/*
giuros016d109962019-01-07 17:47:19 +00002 * Copyright (c) 2017-2019 ARM Limited.
Dmitry Savenkod7295b72017-11-20 22:00:08 +07003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "helpers_asymm.h"
26
Gian Marco Iodice4b908652018-10-18 10:21:02 +010027#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
Giorgio Arena287b5702018-02-16 11:01:04 +000028
Usama Arif6a98a6e2019-05-10 17:07:27 +010029#if defined(ACTIVATION_TYPE) && defined(CONST_0)
Giorgio Arena99ac60b2018-02-16 15:17:23 +000030#define DATA_TYPE uchar
Giorgio Arenadfca60b2018-01-31 10:30:59 +000031#ifndef VEC_SIZE
Giorgio Arena99ac60b2018-02-16 15:17:23 +000032#define VEC_SIZE 8
Giorgio Arenadfca60b2018-01-31 10:30:59 +000033#endif /* VEC_SIZE */
Manuel Bottini30dbeef2019-06-26 16:23:03 +010034#include "activation_layer_quant.cl"
35#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QUANT(ACTIVATION_TYPE, x)
Usama Arif6a98a6e2019-05-10 17:07:27 +010036#else /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
Giorgio Arena99ac60b2018-02-16 15:17:23 +000037#define ACTIVATION_FUNC(x) (x)
Usama Arif6a98a6e2019-05-10 17:07:27 +010038#endif /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
Giorgio Arena99ac60b2018-02-16 15:17:23 +000039
Georgios Pinitasdaa38552018-08-28 17:43:18 +010040#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
41#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010042#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);
Georgios Pinitasdaa38552018-08-28 17:43:18 +010043#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010044#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010045#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
46#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +010047
Georgios Pinitase55b40a2018-09-13 17:20:04 +010048#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
Giorgio Arenadfca60b2018-01-31 10:30:59 +000049
Giorgio Arena287b5702018-02-16 11:01:04 +000050#if CONV_STRIDE_X > 3
51#error "Stride X not supported"
52#endif /* CONV_STRIDE_X > 3 */
Dmitry Savenkod7295b72017-11-20 22:00:08 +070053
Georgios Pinitasdaa38552018-08-28 17:43:18 +010054#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
Giorgio Arenaeff8d952018-07-02 15:29:57 +010055
Usama Arife73686a2019-04-08 17:30:48 +010056#if DILATION_X == 1
57
Dmitry Savenkod7295b72017-11-20 22:00:08 +070058#if CONV_STRIDE_X == 1
Giorgio Arena287b5702018-02-16 11:01:04 +000059#define GET_VALUES(first_value, left, middle, right) \
60 ({ \
61 int8 temp0 = CONVERT(vload8(0, first_value), int8); \
62 int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \
63 \
64 left = CONVERT(temp0.s01234567, int8); \
65 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \
66 right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \
67 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070068#elif CONV_STRIDE_X == 2
Giorgio Arena287b5702018-02-16 11:01:04 +000069#define GET_VALUES(first_value, left, middle, right) \
70 ({ \
71 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
72 int temp1 = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \
73 \
74 left = CONVERT(temp0.s02468ace, int8); \
75 middle = CONVERT(temp0.s13579bdf, int8); \
76 right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \
77 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070078#else /* CONV_STRIDE_X */
Giorgio Arena287b5702018-02-16 11:01:04 +000079#define GET_VALUES(first_value, left, middle, right) \
80 ({ \
81 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
82 int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \
83 \
84 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
85 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \
86 right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \
87 })
Dmitry Savenkod7295b72017-11-20 22:00:08 +070088#endif /* CONV_STRIDE_X */
89
Usama Arife73686a2019-04-08 17:30:48 +010090#else /* DILATION_X == 1 */
91
92#if CONV_STRIDE_X == 1
93#define GET_VALUES(first_value, left, middle, right) \
94 ({ \
95 left = CONVERT(vload8(0, first_value), int8); \
96 middle = CONVERT(vload8(0, first_value + DILATION_X * sizeof(uchar)), int8); \
97 right = CONVERT(vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)), int8); \
98 })
99#elif CONV_STRIDE_X == 2
100#define GET_VALUES(first_value, left, middle, right) \
101 ({ \
102 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
103 left = CONVERT(temp0.s02468ace, int8); \
104 \
105 temp0 = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16); \
106 middle = CONVERT(temp0.s02468ace, int8); \
107 \
108 temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \
109 right = CONVERT(temp0.s02468ace, int8); \
110 })
111#else /* CONV_STRIDE_X */
112#define GET_VALUES(first_value, left, middle, right) \
113 ({ \
114 int16 temp0 = CONVERT(vload16(0, first_value), int16); \
115 int8 temp1 = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \
116 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
117 \
118 temp0 = CONVERT(vload16(0, first_value + DILATION_X * sizeof(uchar)), int16); \
119 temp1 = CONVERT(vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar))), int8); \
120 middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
121 \
122 temp0 = CONVERT(vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)), int16); \
123 temp1 = CONVERT(vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))), int8); \
124 right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
125 })
126
127#endif /* CONV_STRIDE_X */
128#endif /* DILATION_X==1 */
129
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000130/** This function computes the depthwise convolution quantized.
Anthony Barbierf202e502017-11-23 18:02:04 +0000131 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000132 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
133 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000134 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000135 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000136 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Anthony Barbierf202e502017-11-23 18:02:04 +0000137 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
138 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000139 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Anthony Barbierf202e502017-11-23 18:02:04 +0000140 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
141 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
142 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
143 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
144 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
145 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
146 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
147 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
148 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
149 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
150 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
151 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
152 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
153 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
154 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
155 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
156 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
157 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
158 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
159 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Anthony Barbierf202e502017-11-23 18:02:04 +0000160 */
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700161
Pablo Tello47104362019-02-27 13:32:51 +0000162__kernel void dwc_3x3_native_qasymm8_nchw(
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700163 TENSOR3D_DECLARATION(src),
164 TENSOR3D_DECLARATION(dst),
Georgios Pinitas5b2191e2018-02-22 12:56:51 +0000165 TENSOR3D_DECLARATION(weights)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700166#if defined(HAS_BIAS)
Georgios Pinitas5b2191e2018-02-22 12:56:51 +0000167 ,
Giorgio Arena287b5702018-02-16 11:01:04 +0000168 VECTOR_DECLARATION(biases)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700169#endif //defined(HAS_BIAS)
Giorgio Arena287b5702018-02-16 11:01:04 +0000170)
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700171{
172 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
173 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100174 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
175
176 // Extract channel and linearized batch indices
177 const int channel = get_global_id(2) % DST_CHANNELS;
178 const int batch = get_global_id(2) / DST_CHANNELS;
179
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700180#if defined(HAS_BIAS)
181 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700182
Georgios Pinitas728d3cf2018-09-21 13:41:35 +0100183 int bias_value = *((__global int *)(vector_offset(&biases, channel)));
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700184#endif //defined(HAS_BIAS)
185
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100186 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
187 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
188 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arena76572242018-04-04 17:44:26 +0100189
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100190 uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);
191 uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);
192 uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700193
Giorgio Arena287b5702018-02-16 11:01:04 +0000194 int8 values0 = 0;
195 int8 sum0 = 0;
Usama Arife73686a2019-04-08 17:30:48 +0100196#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000197 int8 values1 = 0;
198 int8 sum1 = 0;
Usama Arife73686a2019-04-08 17:30:48 +0100199#endif /* CONV_STRIDE_Y &&DILATION_Y==1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000200
201 // Row0
202 int8 left, middle, right;
203 GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);
204 values0 += left * (int8)(w0.s0);
205 values0 += middle * (int8)(w0.s1);
206 values0 += right * (int8)(w0.s2);
207
208#if WEIGHTS_OFFSET != 0
209 sum0 += left + middle + right;
210#endif /* WEIGHTS_OFFSET != 0 */
211
212 // Row1
Usama Arife73686a2019-04-08 17:30:48 +0100213 GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left, middle, right);
Giorgio Arena287b5702018-02-16 11:01:04 +0000214 values0 += left * (int8)(w1.s0);
215 values0 += middle * (int8)(w1.s1);
216 values0 += right * (int8)(w1.s2);
Usama Arife73686a2019-04-08 17:30:48 +0100217#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000218 values1 += left * (int8)(w0.s0);
219 values1 += middle * (int8)(w0.s1);
220 values1 += right * (int8)(w0.s2);
Usama Arife73686a2019-04-08 17:30:48 +0100221#endif /* CONV_STRIDE_Y && DILATION_Y== 1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000222
223#if WEIGHTS_OFFSET != 0
224 int8 tmp = left + middle + right;
225 sum0 += tmp;
Usama Arife73686a2019-04-08 17:30:48 +0100226#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000227 sum1 += tmp;
Usama Arife73686a2019-04-08 17:30:48 +0100228#endif /* CONV_STRIDE_Y &&DILATION_Y== 1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000229#endif /* WEIGHTS_OFFSET != 0 */
230
231 // Row2
Usama Arife73686a2019-04-08 17:30:48 +0100232 GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left, middle, right);
Giorgio Arena287b5702018-02-16 11:01:04 +0000233 values0 += left * (int8)(w2.s0);
234 values0 += middle * (int8)(w2.s1);
235 values0 += right * (int8)(w2.s2);
Usama Arife73686a2019-04-08 17:30:48 +0100236#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000237 values1 += left * (int8)(w1.s0);
238 values1 += middle * (int8)(w1.s1);
239 values1 += right * (int8)(w1.s2);
Usama Arife73686a2019-04-08 17:30:48 +0100240#endif /* CONV_STRIDE_Y &&DILATION_Y == 1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000241
242#if WEIGHTS_OFFSET != 0
243 tmp = left + middle + right;
244 sum0 += tmp;
Usama Arife73686a2019-04-08 17:30:48 +0100245#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000246 sum1 += tmp;
Usama Arife73686a2019-04-08 17:30:48 +0100247#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000248#endif /* WEIGHTS_OFFSET != 0 */
249
Usama Arife73686a2019-04-08 17:30:48 +0100250#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000251 // Row3
252 GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
253 values1 += left * (int8)(w2.s0);
254 values1 += middle * (int8)(w2.s1);
255 values1 += right * (int8)(w2.s2);
256
257#if WEIGHTS_OFFSET != 0
258 sum1 += left + middle + right;
259#endif /* WEIGHTS_OFFSET != 0 */
Usama Arife73686a2019-04-08 17:30:48 +0100260#endif /* CONV_STRIDE_Y && DILATION_Y == 1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000261
262#if defined(HAS_BIAS)
263 values0 += (int8)(bias_value);
Usama Arife73686a2019-04-08 17:30:48 +0100264#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000265 values1 += (int8)(bias_value);
Usama Arife73686a2019-04-08 17:30:48 +0100266#endif /* CONV_STRIDE_Y & &DILATION_Y == 1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000267#endif //defined(HAS_BIAS)
268
269#if WEIGHTS_OFFSET != 0
270 values0 += sum0 * (int8)(WEIGHTS_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100271#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000272 values1 += sum1 * (int8)(WEIGHTS_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100273#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000274#endif /* WEIGHTS_OFFSET != 0 */
275
276#if INPUT_OFFSET != 0
277 ushort sum_weights = 0;
278 ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
279 sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
280 values0 += sum_weights * (int8)(INPUT_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100281#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000282 values1 += sum_weights * (int8)(INPUT_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100283#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
Giorgio Arena287b5702018-02-16 11:01:04 +0000284#endif /* INPUT_OFFSET != 0 */
285
286#if K_OFFSET != 0
287 values0 += (int8)(K_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100288#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arena287b5702018-02-16 11:01:04 +0000289 values1 += (int8)(K_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100290#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
Giorgio Arena287b5702018-02-16 11:01:04 +0000291#endif /* K_OFFSET != 0 */
292
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100293#if defined(REAL_MULTIPLIER)
294
295 values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
296
297#else // defined(REAL_MULTIPLIER)
298
Pablo Tello47104362019-02-27 13:32:51 +0000299 values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100300
301#endif // defined(REAL_MULTIPLIER)
302
Giorgio Arena287b5702018-02-16 11:01:04 +0000303 values0 += (int8)OUTPUT_OFFSET;
304 uchar8 res0 = convert_uchar8_sat(values0);
305 res0 = max(res0, (uchar8)0);
306 res0 = min(res0, (uchar8)255);
307
Giorgio Arena99ac60b2018-02-16 15:17:23 +0000308 vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
Usama Arife73686a2019-04-08 17:30:48 +0100309#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100310#if defined(REAL_MULTIPLIER)
311
312 values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
313
314#else // defined(REAL_MULTIPLIER)
Giorgio Arena287b5702018-02-16 11:01:04 +0000315
Pablo Tello47104362019-02-27 13:32:51 +0000316 values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100317
318#endif // defined(REAL_MULTIPLIER)
319
Giorgio Arena287b5702018-02-16 11:01:04 +0000320 values1 += (int8)OUTPUT_OFFSET;
321 uchar8 res1 = convert_uchar8_sat(values1);
322 res1 = max(res1, (uchar8)0);
323 res1 = min(res1, (uchar8)255);
324
Giorgio Arena99ac60b2018-02-16 15:17:23 +0000325 vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +0100326#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
Dmitry Savenkod7295b72017-11-20 22:00:08 +0700327}
Giorgio Arena287b5702018-02-16 11:01:04 +0000328
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100329#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
Usama Arife73686a2019-04-08 17:30:48 +0100330#if DILATION_X == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100331#if CONV_STRIDE_X == 1
332#define GET_VALUES(first_value, left, middle, right) \
333 ({ \
334 uchar8 temp0 = vload8(0, first_value); \
335 uchar2 temp1 = vload2(0, (first_value + 8 * sizeof(uchar))); \
336 \
337 left = temp0.s01234567; \
338 middle = (uchar8)(temp0.s1234, temp0.s567, temp1.s0); \
339 right = (uchar8)(temp0.s2345, temp0.s67, temp1.s01); \
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000340 })
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100341#elif CONV_STRIDE_X == 2
342#define GET_VALUES(first_value, left, middle, right) \
343 ({ \
344 uchar16 temp0 = vload16(0, first_value); \
345 uchar temp1 = *(first_value + 16 * sizeof(uchar)); \
346 \
347 left = temp0.s02468ace; \
348 middle = temp0.s13579bdf; \
349 right = (uchar8)(temp0.s2468, temp0.sace, temp1); \
350 })
351#else /* CONV_STRIDE_X */
352#define GET_VALUES(first_value, left, middle, right) \
353 ({ \
354 uchar16 temp0 = vload16(0, first_value); \
355 uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \
356 \
357 left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
358 middle = (uchar8)(temp0.s147a, temp0.sd, temp1.s036); \
359 right = (uchar8)(temp0.s258b, temp0.se, temp1.s147); \
360 })
361#endif /* CONV_STRIDE_X */
Usama Arife73686a2019-04-08 17:30:48 +0100362#else /*DILATION_X==1*/
363
364#if CONV_STRIDE_X == 1
365#define GET_VALUES(first_value, left, middle, right) \
366 ({ \
367 left = vload8(0, first_value); \
368 middle = vload8(0, first_value + DILATION_X * sizeof(uchar)); \
369 right = vload8(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
370 })
371#elif CONV_STRIDE_X == 2
372#define GET_VALUES(first_value, left, middle, right) \
373 ({ \
374 uchar16 temp0 = vload16(0, first_value); \
375 left = temp0.s02468ace; \
376 temp0 = vload16(0, first_value + DILATION_X * sizeof(uchar)); \
377 middle = temp0.s02468ace; \
378 temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
379 right = temp0.s02468ace; \
380 })
381#else /* CONV_STRIDE_X */
382#define GET_VALUES(first_value, left, middle, right) \
383 ({ \
384 uchar16 temp0 = vload16(0, first_value); \
385 uchar8 temp1 = vload8(0, (first_value + 16 * sizeof(uchar))); \
386 left = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
387 \
388 temp0 = vload16(0, first_value + DILATION_X * sizeof(uchar)); \
389 temp1 = vload8(0, (first_value + (16 + DILATION_X) * sizeof(uchar))); \
390 middle = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
391 \
392 temp0 = vload16(0, first_value + 2 * DILATION_X * sizeof(uchar)); \
393 temp1 = vload8(0, (first_value + (16 + 2 * DILATION_X) * sizeof(uchar))); \
394 right = (uchar8)(temp0.s0369, temp0.scf, temp1.s25); \
395 })
396
397#endif /* CONV_STRIDE_X */
398#endif /*DILATION_X==1*/
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100399/** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000400 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000401 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
402 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000403 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000404 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000405 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000406 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
407 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000408 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000409 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
410 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
411 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
412 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
413 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
414 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
415 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
416 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
417 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
418 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
419 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
420 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
421 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
422 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
423 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
424 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
425 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
426 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
427 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
428 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
429 */
430
Pablo Tello47104362019-02-27 13:32:51 +0000431__kernel void dwc_3x3_native_qasymm8_dot8_nchw(
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100432 TENSOR3D_DECLARATION(src),
433 TENSOR3D_DECLARATION(dst),
434 TENSOR3D_DECLARATION(weights)
435#if defined(HAS_BIAS)
436 ,
437 VECTOR_DECLARATION(biases)
438#endif //defined(HAS_BIAS)
439)
440{
441 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
442 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100443 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100444
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100445 // Extract channel and linearized batch indices
446 const int channel = get_global_id(2) % DST_CHANNELS;
447 const int batch = get_global_id(2) / DST_CHANNELS;
448
449#if defined(HAS_BIAS)
450 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
451
452 const int bias_value = *((__global int *)(vector_offset(&biases, channel)));
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100453#endif //defined(HAS_BIAS)
454
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100455 // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
456 src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
457 __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100458
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100459 uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);
460 uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);
461 uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100462
463 uchar8 left0, middle0, right0;
464 uchar8 left1, middle1, right1;
465 uchar8 left2, middle2, right2;
466
467 int8 values0 = 0;
468 int8 sum0 = 0;
469
470 GET_VALUES(src.ptr + 0 * src_stride_y, left0, middle0, right0);
Usama Arife73686a2019-04-08 17:30:48 +0100471 GET_VALUES(src.ptr + DILATION_Y * src_stride_y, left1, middle1, right1);
472 GET_VALUES(src.ptr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100473
474#if WEIGHTS_OFFSET != 0
475 sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
476 sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
477 sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
478#endif /* WEIGHTS_OFFSET != 0 */
479
Usama Arife73686a2019-04-08 17:30:48 +0100480#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100481 // If conv_stride_y is equals to 1, we compute two output rows
482
483 uchar8 left3, middle3, right3;
484 int8 values1 = 0;
485 int8 sum1 = 0;
486
487 GET_VALUES(src.ptr + 3 * src_stride_y, left3, middle3, right3);
488
489#if WEIGHTS_OFFSET != 0
490 sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
491 sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
492 sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
493#endif /* WEIGHTS_OFFSET != 0 */
Usama Arife73686a2019-04-08 17:30:48 +0100494#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100495
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100496 ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
497 ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100498 values0.s0 += right2.s0 * w2.s2;
499
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100500 ARM_DOT((uchar4)(left0.s1, middle0.s1, right0.s1, left1.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);
501 ARM_DOT((uchar4)(middle1.s1, right1.s1, left2.s1, middle2.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100502 values0.s1 += right2.s1 * w2.s2;
503
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100504 ARM_DOT((uchar4)(left0.s2, middle0.s2, right0.s2, left1.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);
505 ARM_DOT((uchar4)(middle1.s2, right1.s2, left2.s2, middle2.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100506 values0.s2 += right2.s2 * w2.s2;
507
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100508 ARM_DOT((uchar4)(left0.s3, middle0.s3, right0.s3, left1.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);
509 ARM_DOT((uchar4)(middle1.s3, right1.s3, left2.s3, middle2.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100510 values0.s3 += right2.s3 * w2.s2;
511
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100512 ARM_DOT((uchar4)(left0.s4, middle0.s4, right0.s4, left1.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);
513 ARM_DOT((uchar4)(middle1.s4, right1.s4, left2.s4, middle2.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100514 values0.s4 += right2.s4 * w2.s2;
515
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100516 ARM_DOT((uchar4)(left0.s5, middle0.s5, right0.s5, left1.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);
517 ARM_DOT((uchar4)(middle1.s5, right1.s5, left2.s5, middle2.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100518 values0.s5 += right2.s5 * w2.s2;
519
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100520 ARM_DOT((uchar4)(left0.s6, middle0.s6, right0.s6, left1.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);
521 ARM_DOT((uchar4)(middle1.s6, right1.s6, left2.s6, middle2.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100522 values0.s6 += right2.s6 * w2.s2;
523
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100524 ARM_DOT((uchar4)(left0.s7, middle0.s7, right0.s7, left1.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);
525 ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100526 values0.s7 += right2.s7 * w2.s2;
527
Usama Arife73686a2019-04-08 17:30:48 +0100528#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100529 ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
530 ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100531 values1.s0 += right3.s0 * w2.s2;
532
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100533 ARM_DOT((uchar4)(left1.s1, middle1.s1, right1.s1, left2.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);
534 ARM_DOT((uchar4)(middle2.s1, right2.s1, left3.s1, middle3.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100535 values1.s1 += right3.s1 * w2.s2;
536
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100537 ARM_DOT((uchar4)(left1.s2, middle1.s2, right1.s2, left2.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);
538 ARM_DOT((uchar4)(middle2.s2, right2.s2, left3.s2, middle3.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100539 values1.s2 += right3.s2 * w2.s2;
540
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100541 ARM_DOT((uchar4)(left1.s3, middle1.s3, right1.s3, left2.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);
542 ARM_DOT((uchar4)(middle2.s3, right2.s3, left3.s3, middle3.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100543 values1.s3 += right3.s3 * w2.s2;
544
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100545 ARM_DOT((uchar4)(left1.s4, middle1.s4, right1.s4, left2.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);
546 ARM_DOT((uchar4)(middle2.s4, right2.s4, left3.s4, middle3.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100547 values1.s4 += right3.s4 * w2.s2;
548
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100549 ARM_DOT((uchar4)(left1.s5, middle1.s5, right1.s5, left2.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);
550 ARM_DOT((uchar4)(middle2.s5, right2.s5, left3.s5, middle3.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100551 values1.s5 += right3.s5 * w2.s2;
552
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100553 ARM_DOT((uchar4)(left1.s6, middle1.s6, right1.s6, left2.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);
554 ARM_DOT((uchar4)(middle2.s6, right2.s6, left3.s6, middle3.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100555 values1.s6 += right3.s6 * w2.s2;
556
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100557 ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
558 ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100559 values1.s7 += right3.s7 * w2.s2;
Usama Arife73686a2019-04-08 17:30:48 +0100560#endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100561
562#if defined(HAS_BIAS)
563 values0 += (int8)(bias_value);
Usama Arife73686a2019-04-08 17:30:48 +0100564#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100565 values1 += (int8)(bias_value);
Usama Arife73686a2019-04-08 17:30:48 +0100566#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100567#endif //defined(HAS_BIAS)
568
569#if WEIGHTS_OFFSET != 0
570 values0 += sum0 * (int8)(WEIGHTS_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100571#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100572 values1 += sum1 * (int8)(WEIGHTS_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100573#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100574#endif /* WEIGHTS_OFFSET != 0 */
575
576#if INPUT_OFFSET != 0
577 ushort sum_weights = 0;
578 ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
579 sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
580 values0 += sum_weights * (int8)(INPUT_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100581#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100582 values1 += sum_weights * (int8)(INPUT_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100583#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100584#endif /* INPUT_OFFSET != 0 */
585
586#if K_OFFSET != 0
587 values0 += (int8)(K_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100588#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100589 values1 += (int8)(K_OFFSET);
Usama Arife73686a2019-04-08 17:30:48 +0100590#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100591#endif /* K_OFFSET != 0 */
592
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100593#if defined(REAL_MULTIPLIER)
594
595 values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
596
597#else // defined(REAL_MULTIPLIER)
598
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100599 values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100600
601#endif // defined(REAL_MULTIPLIER)
602
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100603 values0 += (int8)OUTPUT_OFFSET;
604 uchar8 res0 = convert_uchar8_sat(values0);
605 res0 = max(res0, (uchar8)0);
606 res0 = min(res0, (uchar8)255);
607
608 vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
Usama Arife73686a2019-04-08 17:30:48 +0100609#if CONV_STRIDE_Y == 1 && DILATION_Y == 1
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100610
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100611#if defined(REAL_MULTIPLIER)
612
613 values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
614
615#else // defined(REAL_MULTIPLIER)
616
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100617 values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100618
619#endif // defined(REAL_MULTIPLIER)
620
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100621 values1 += (int8)OUTPUT_OFFSET;
622 uchar8 res1 = convert_uchar8_sat(values1);
623 res1 = max(res1, (uchar8)0);
624 res1 = min(res1, (uchar8)255);
625
626 vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
Usama Arife73686a2019-04-08 17:30:48 +0100627#endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100628}
629
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100630#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100631
Georgios Pinitase55b40a2018-09-13 17:20:04 +0100632#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100633
634#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
635
636#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
637
638#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100639#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100640#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
641#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)
642
643#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT)
644
645#if WEIGHTS_OFFSET != 0
646#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \
647 ({ \
648 sum += CONVERT(x, VEC_INT); \
649 MULTIPLY_ADD(x, y, acc); \
650 })
651#else /* WEIGHTS_OFFSET != 0 */
652#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
653#endif /* WEIGHTS_OFFSET != 0 */
654
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100655#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
giuros016d109962019-01-07 17:47:19 +0000656#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \
657 ({ \
658 ARM_DOT((uchar4)(val0, val1, val2, val3), w0.s0123, acc); \
659 ARM_DOT((uchar4)(val4, val5, val6, val7), w0.s4567, acc); \
660 acc += val8 * w1; \
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100661 })
662
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100663#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
664 ({ \
giuros016d109962019-01-07 17:47:19 +0000665 sum = val0; \
666 ARM_DOT((uchar4)(val1, val2, val3, val4), (uchar4)1, sum); \
667 ARM_DOT((uchar4)(val5, val6, val7, val8), (uchar4)1, sum); \
668 })
669
670#define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \
671 ({ \
672 sum = w1; \
673 ARM_DOT(w0.s0123, (uchar4)1, sum); \
674 ARM_DOT(w0.s4567, (uchar4)1, sum); \
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100675 })
676
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100677#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100678
Pablo Tello47104362019-02-27 13:32:51 +0000679#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100680/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
681 *
Pablo Tello47104362019-02-27 13:32:51 +0000682 * @note This kernel assumes VEC_SIZE is 4.
giuros016d109962019-01-07 17:47:19 +0000683 * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100684 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
685 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
686 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
687 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
688 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
689 * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
690 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000691 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
692 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100693 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000694 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100695 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100696 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000697 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
698 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
699 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
700 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100701 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
702 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
703 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
704 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
705 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
706 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
707 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000708 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
709 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100710 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
giuros016d109962019-01-07 17:47:19 +0000711 * @param[in] weights_ptr Pointer to the weights tensor reshaped. Supported data types: same as @p src_ptr
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100712 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
713 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
714 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
715 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100716 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
717 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
718 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
719 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
720 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
721 * @param[in] max_offset Max offset for the input tensor
722 */
Pablo Tello47104362019-02-27 13:32:51 +0000723__kernel void dwc_3x3_reshaped_qasymm8_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +0000724 TENSOR4D_DECLARATION(src),
725 TENSOR4D_DECLARATION(dst),
giuros016d109962019-01-07 17:47:19 +0000726 IMAGE_DECLARATION(weights),
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100727#if defined(HAS_BIAS)
728 VECTOR_DECLARATION(biases),
729#endif /* defined(HAS_BIAS) */
730 int max_offset)
731{
732 const int x = get_global_id(0); // channels
733 const int y = get_global_id(1); // spatial coordinate x
Georgios Pinitas37044642018-10-30 14:53:25 +0000734#if defined(DST_DEPTH)
735 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
736 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Pablo Tello47104362019-02-27 13:32:51 +0000737#else // defined(DST_DEPTH)
738 int z = get_global_id(2); // spatial coordinate y
739#endif // defined(DST_DEPTH)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100740
giuros016d109962019-01-07 17:47:19 +0000741 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100742
Georgios Pinitas37044642018-10-30 14:53:25 +0000743#if defined(DST_DEPTH)
744 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
745#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100746 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +0000747#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100748
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100749 int z_coord = 0;
750 int4 offset = 0;
Usama Arife73686a2019-04-08 17:30:48 +0100751 int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3)) - (int)CONV_PAD_LEFT;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100752
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100753 // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
754 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
755 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
756 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
757 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
758
759 int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
760
giuros016d109962019-01-07 17:47:19 +0000761 // We compute VEC_SIZEx1x1 [C,W,H] elements
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100762 VEC_INT acc = 0, sum = 0;
763
764 // Load weights
Pablo Tello47104362019-02-27 13:32:51 +0000765 uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
766 uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
767 uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16);
768
769 uchar4 w0 = w0_tmp.s0123;
770 uchar4 w1 = w0_tmp.s4567;
771 uchar4 w2 = w0_tmp.s89AB;
772 uchar4 w3 = w0_tmp.sCDEF;
773
774 uchar4 w4 = w1_tmp.s0123;
775 uchar4 w5 = w1_tmp.s4567;
776 uchar4 w6 = w1_tmp.s89AB;
777 uchar4 w7 = w1_tmp.sCDEF;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100778
779#if INPUT_OFFSET != 0
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100780 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
781 + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
782 + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100783#endif /* INPUT_OFFSET != 0 */
784
785 // Load input values
786 // z == 0
787 // Clamp z_coord as for z = 0, it can be negative
788 // z_coord is casted to unsigned int in order to use just a min() operation
789 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100790 z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100791 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
792 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100793 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100794
795 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
796 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
797 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
798
799 // z == 1
800 // z_coord can be only negative for z = 0 so we do not need to clamp it
801 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Usama Arife73686a2019-04-08 17:30:48 +0100802 z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100803 offset = y_offset + (int4)(z_coord * src_stride_z);
804 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
805 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
806 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
807
808 // z == 2
Usama Arife73686a2019-04-08 17:30:48 +0100809 // Offset can be out-of-bound so we need to check if it is greater than max_offset
810 z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;
811 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100812 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100813 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
814 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
815 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
816
817 MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);
818 MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);
819 MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);
820
821 MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);
822 MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);
823 MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);
824
825 MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);
826 MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);
827 MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);
828
829#if defined(HAS_BIAS)
830 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
831 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
832 acc += bias_values;
833#endif // defined(HAS_BIAS)
834
835#if WEIGHTS_OFFSET != 0
836 acc += WEIGHTS_OFFSET * sum;
837#endif /* WEIGHTS_OFFSET != 0 */
838
839#if INPUT_OFFSET != 0
840 acc += INPUT_OFFSET * sum_we;
841#endif /* INPUT_OFFSET != 0 */
842
843#if K_OFFSET != 0
844 acc += (VEC_INT)K_OFFSET;
845#endif /* K_OFFSET != 0 */
846
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100847#if defined(REAL_MULTIPLIER)
848
849 acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
850
851#else // defined(REAL_MULTIPLIER)
852
Pablo Tello47104362019-02-27 13:32:51 +0000853 acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100854#endif // defined(REAL_MULTIPLIER)
855
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100856 acc += (VEC_INT)OUTPUT_OFFSET;
857
858 VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR);
859 res = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255);
860
Georgios Pinitas37044642018-10-30 14:53:25 +0000861#if defined(DST_DEPTH)
862 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
863#else /* defined(DST_DEPTH) */
864 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
865#endif /* defined(DST_DEPTH) */
866
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100867 VSTORE(VEC_SIZE)
Georgios Pinitas37044642018-10-30 14:53:25 +0000868 (ACTIVATION_FUNC(res), 0, dst_addr);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100869}
870#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
871
Pablo Tello47104362019-02-27 13:32:51 +0000872#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4
giuros016d109962019-01-07 17:47:19 +0000873/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100874 *
Pablo Tello47104362019-02-27 13:32:51 +0000875 * @note This kernel assumes VEC_SIZE is 4.
giuros016d109962019-01-07 17:47:19 +0000876 * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100877 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
878 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
879 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
880 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
881 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
882 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
883 *
Georgios Pinitas37044642018-10-30 14:53:25 +0000884 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
885 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100886 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000887 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100888 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100889 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000890 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
891 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
892 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
893 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100894 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
895 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
896 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
897 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
898 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
899 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
900 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +0000901 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
902 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100903 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
904 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
905 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
906 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
907 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
908 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100909 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
910 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
911 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
912 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
913 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
914 * @param[in] max_offset Max offset for the input tensor
915 */
916
Pablo Tello47104362019-02-27 13:32:51 +0000917__kernel void dwc_3x3_reshaped_qasymm8_stride1_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +0000918 TENSOR4D_DECLARATION(src),
919 TENSOR4D_DECLARATION(dst),
giuros016d109962019-01-07 17:47:19 +0000920 IMAGE_DECLARATION(weights),
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000921#if defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100922 VECTOR_DECLARATION(biases),
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000923#endif /* defined(HAS_BIAS) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100924 int max_offset)
Giorgio Arenadfca60b2018-01-31 10:30:59 +0000925{
Giorgio Arenafa23f112018-06-19 11:27:38 +0100926 int x = get_global_id(0);
927 int y = get_global_id(1);
Georgios Pinitas37044642018-10-30 14:53:25 +0000928#if defined(DST_DEPTH)
929 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
930 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Pablo Tello47104362019-02-27 13:32:51 +0000931#else // defined(DST_DEPTH)
932 int z = get_global_id(2); // spatial coordinate y
933#endif // defined(DST_DEPTH)
Giorgio Arenafa23f112018-06-19 11:27:38 +0100934
giuros016d109962019-01-07 17:47:19 +0000935 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100936
Georgios Pinitas37044642018-10-30 14:53:25 +0000937#if defined(DST_DEPTH)
938 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
939#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100940 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +0000941#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100942
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100943 int z_coord = 0;
944 int4 offset = 0;
945 int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100946
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100947 // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
948 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
949 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
950 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
951 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
952
953 int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
954
955 // We compute 4x2x2 [C,W,H] elements
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100956 VEC_INT acc0 = 0, sum0 = 0;
957 VEC_INT acc1 = 0, sum1 = 0;
958 VEC_INT acc2 = 0, sum2 = 0;
959 VEC_INT acc3 = 0, sum3 = 0;
960
961 // Load weights
Pablo Tello47104362019-02-27 13:32:51 +0000962 uchar16 w0_tmp = VLOAD(16)(0, weights_addr);
963 uchar16 w1_tmp = VLOAD(16)(0, weights_addr + 16);
964 uchar4 w8 = VLOAD(4)(0, weights_addr + 2 * 16);
965
966 uchar4 w0 = w0_tmp.s0123;
967 uchar4 w1 = w0_tmp.s4567;
968 uchar4 w2 = w0_tmp.s89AB;
969 uchar4 w3 = w0_tmp.sCDEF;
970
971 uchar4 w4 = w1_tmp.s0123;
972 uchar4 w5 = w1_tmp.s4567;
973 uchar4 w6 = w1_tmp.s89AB;
974 uchar4 w7 = w1_tmp.sCDEF;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100975
976#if INPUT_OFFSET != 0
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100977 VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
978 + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
979 + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100980#endif /* INPUT_OFFSET != 0 */
981
982 // Load input values
983 // z == 0
984 // Clamp z_coord as for z = 0, it can be negative
985 // z_coord is casted to unsigned int in order to use just a min() operation
986 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100987 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100988 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
989 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +0100990 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +0100991
992 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
993 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
994 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
995 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
996
997 // z == 1
998 // z_coord can be only negative for z = 0 so we do not need to clamp it
999 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001000 z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001001 offset = y_offset + (int4)(z_coord * src_stride_z);
1002 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1003 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1004 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1005 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1006
1007 // z == 2
1008 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1009 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1010 offset += (int4)src_stride_z;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001011 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001012 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1013 VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1014 VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1015 VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1016
1017 // z == 3
1018 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1019 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1020 offset += (int4)(src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001021 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001022 VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1023 VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1024 VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1025 VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1026
1027 MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);
1028 MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);
1029 MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);
1030 MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);
1031 MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);
1032 MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);
1033
1034 MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);
1035 MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);
1036 MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);
1037 MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);
1038 MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);
1039 MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);
1040
1041 MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);
1042 MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);
1043 MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);
1044 MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);
1045 MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);
1046 MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);
1047
1048 MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);
1049 MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);
1050 MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);
1051 MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);
1052 MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);
1053 MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);
1054
1055 MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);
1056 MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);
1057 MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);
1058 MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);
1059 MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);
1060 MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);
1061
1062 MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);
1063 MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);
1064 MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);
1065 MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);
1066 MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);
1067 MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);
1068
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001069#if defined(HAS_BIAS)
1070 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1071
1072 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001073
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001074 acc0 += bias_values;
1075 acc1 += bias_values;
1076 acc2 += bias_values;
1077 acc3 += bias_values;
1078#endif /* defined(HAS_BIAS) */
1079
1080#if WEIGHTS_OFFSET != 0
1081 acc0 += WEIGHTS_OFFSET * sum0;
1082 acc1 += WEIGHTS_OFFSET * sum1;
1083 acc2 += WEIGHTS_OFFSET * sum2;
1084 acc3 += WEIGHTS_OFFSET * sum3;
1085#endif /* WEIGHTS_OFFSET != 0 */
1086
1087#if INPUT_OFFSET != 0
1088 VEC_INT offs = INPUT_OFFSET * sum_we;
1089
1090 acc0 += offs;
1091 acc1 += offs;
1092 acc2 += offs;
1093 acc3 += offs;
1094#endif /* INPUT_OFFSET != 0 */
1095
1096#if K_OFFSET != 0
1097 acc0 += (VEC_INT)K_OFFSET;
1098 acc1 += (VEC_INT)K_OFFSET;
1099 acc2 += (VEC_INT)K_OFFSET;
1100 acc3 += (VEC_INT)K_OFFSET;
1101#endif /* K_OFFSET != 0 */
1102
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001103#if defined(REAL_MULTIPLIER)
1104
1105 acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1106 acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1107 acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1108 acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1109
1110#else // defined(REAL_MULTIPLIER)
1111
Pablo Tello47104362019-02-27 13:32:51 +00001112 acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1113 acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1114 acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1115 acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001116
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001117#endif // defined(REAL_MULTIPLIER)
1118
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001119 acc0 += (VEC_INT)OUTPUT_OFFSET;
1120 acc1 += (VEC_INT)OUTPUT_OFFSET;
1121 acc2 += (VEC_INT)OUTPUT_OFFSET;
1122 acc3 += (VEC_INT)OUTPUT_OFFSET;
1123
1124 VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
1125 VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
1126 VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
1127 VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);
1128
1129 res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
1130 res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
1131 res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
1132 res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
1133
Georgios Pinitas37044642018-10-30 14:53:25 +00001134#if defined(DST_DEPTH)
1135 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1136#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001137 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001138#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001139
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001140 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001141 (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001142 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001143 (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001144
1145#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1146 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1147#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
1148 {
1149 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001150 (ACTIVATION_FUNC(res2), 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001151 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001152 (ACTIVATION_FUNC(res3), 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001153 }
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001154}
1155
giuros016d109962019-01-07 17:47:19 +00001156#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4
1157/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product.
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001158 *
giuros016d109962019-01-07 17:47:19 +00001159 * @note This kernel assumes VEC_SIZE is 4.
1160 * @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001161 * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
1162 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
1163 * @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)
1164 * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
1165 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
1166 * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001167 * @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.
1168 * If not, the quantization will be performed using a fixed point multiplication
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001169 *
Georgios Pinitas37044642018-10-30 14:53:25 +00001170 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8
1171 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001172 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001173 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001174 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001175 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001176 * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
1177 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
1178 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
1179 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001180 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
1181 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1182 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1183 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1184 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1185 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1186 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
Georgios Pinitas37044642018-10-30 14:53:25 +00001187 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
1188 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001189 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1190 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
1191 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
1192 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
1193 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
1194 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001195 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
1196 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: QASYMM8
1197 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
1198 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1199 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001200 * @param[in] max_offset The maximum allowed offset for the input tensor
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001201 */
Pablo Tello47104362019-02-27 13:32:51 +00001202__kernel void dwc_3x3_reshaped_qasymm8_dot8_stride1_nhwc(
Georgios Pinitas37044642018-10-30 14:53:25 +00001203 TENSOR4D_DECLARATION(src),
1204 TENSOR4D_DECLARATION(dst),
giuros016d109962019-01-07 17:47:19 +00001205 IMAGE_DECLARATION(weights),
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001206#if defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001207 VECTOR_DECLARATION(biases),
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001208#endif // defined(HAS_BIAS)
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001209 int max_offset)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001210{
Giorgio Arenafa23f112018-06-19 11:27:38 +01001211 int x = get_global_id(0);
1212 int y = get_global_id(1);
Georgios Pinitas37044642018-10-30 14:53:25 +00001213#if defined(DST_DEPTH)
1214 int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
1215 int b = get_global_id(2) / (int)DST_DEPTH; // batch
Pablo Tello47104362019-02-27 13:32:51 +00001216#else // defined(DST_DEPTH)
1217 int z = get_global_id(2); // spatial coordinate y
1218#endif // defined(DST_DEPTH)
Giorgio Arenafa23f112018-06-19 11:27:38 +01001219
giuros016d109962019-01-07 17:47:19 +00001220 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001221
Georgios Pinitas37044642018-10-30 14:53:25 +00001222#if defined(DST_DEPTH)
1223 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
1224#else /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001225 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
Georgios Pinitas37044642018-10-30 14:53:25 +00001226#endif /* defined(DST_DEPTH) */
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001227
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001228 int z_coord = 0;
1229 int4 offset = 0;
1230 int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001231
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001232 // Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1
1233 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
1234 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
1235 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
1236 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
1237
1238 int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
1239
1240 // We compute 4x2x1 [C,W,H] elements
1241 VEC_INT acc0 = 0;
1242 VEC_INT acc1 = 0;
1243 VEC_INT sum0 = 0;
1244 VEC_INT sum1 = 0;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001245
1246 // Load weights
giuros016d109962019-01-07 17:47:19 +00001247 uchar16 w0 = VLOAD(16)(0, weights_addr);
1248 uchar16 w1 = VLOAD(16)(0, weights_addr + 16);
1249 uchar4 w2 = VLOAD(4)(0, weights_addr + 32);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001250
1251#if INPUT_OFFSET != 0
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001252 // Initilize the final result with the weights reduction multiplied by INPUT_OFFSET
giuros016d109962019-01-07 17:47:19 +00001253 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8);
1254 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
1255 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA);
1256 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001257
1258 // Multiply the weights reduction with INPUT_OFFSET
1259 acc0 = INPUT_OFFSET * acc0;
1260
1261 acc1 = acc0;
1262#endif // INPUT_OFFSET != 0
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001263
1264 // Load input values
1265 // z == 0
1266 // Clamp z_coord as for z = 0, it can be negative
1267 // z_coord is casted to unsigned int in order to use just a min() operation
1268 // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001269 z_coord = z - (int)CONV_PAD_TOP;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001270 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1271 offset = y_offset + (int4)(z_coord * src_stride_z);
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001272 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001273
1274 VEC_UCHAR values0 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1275 VEC_UCHAR values1 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1276 VEC_UCHAR values2 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1277 VEC_UCHAR values3 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1278
1279 // z == 1
1280 // z_coord can be only negative for z = 0 so we do not need to clamp it
1281 // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001282 z_coord = z - (int)CONV_PAD_TOP + 1;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001283 offset = y_offset + (int4)(z_coord * src_stride_z);
1284 VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1285 VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1286 VEC_UCHAR values6 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1287 VEC_UCHAR values7 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1288
1289 // z == 2
1290 // After z = 1 we can simply add src_stride_z to offset without updating z_coord
1291 // However offset can be out-of-bound so we need to check if it is greater than max_offset
1292 offset += (int4)src_stride_z;
Georgios Pinitas4e0d3812018-08-01 20:16:34 +01001293 offset = min(offset, (int4)max_offset);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001294 VEC_UCHAR values8 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
1295 VEC_UCHAR values9 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
1296 VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
1297 VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
1298
giuros016d109962019-01-07 17:47:19 +00001299 DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0);
1300 DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0);
1301 DOT_PRODUCT(acc0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0, w0.s01234567, w0.s8);
1302 DOT_PRODUCT(acc1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0, w0.s01234567, w0.s8);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001303
giuros016d109962019-01-07 17:47:19 +00001304 DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1);
1305 DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1);
1306 DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
1307 DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (uchar8)((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
1308
1309 DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2);
1310 DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2);
1311 DOT_PRODUCT(acc0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2, w1.s23456789, w1.sA);
1312 DOT_PRODUCT(acc1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2, w1.s23456789, w1.sA);
1313
1314 DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3);
1315 DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3);
1316 DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
1317 DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (uchar8)((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001318
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001319#if defined(HAS_BIAS)
1320 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
1321
1322 VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001323
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001324 acc0 += bias_values;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001325 acc1 += bias_values;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001326
1327#endif // defined(HAS_BIAS)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001328
1329#if WEIGHTS_OFFSET != 0
1330 acc0 += WEIGHTS_OFFSET * sum0;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001331 acc1 += WEIGHTS_OFFSET * sum1;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001332#endif // WEIGHTS_OFFSET != 0
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001333
1334#if K_OFFSET != 0
1335 acc0 += (VEC_INT)K_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001336 acc1 += (VEC_INT)K_OFFSET;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001337
1338#endif // K_OFFSET != 0
1339
1340#if defined(REAL_MULTIPLIER)
1341
1342 acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1343 acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
1344
1345#else // defined(REAL_MULTIPLIER)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001346
Pablo Tello47104362019-02-27 13:32:51 +00001347 acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1348 acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001349
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001350#endif // defined(REAL_MULTIPLIER)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001351 acc0 += (VEC_INT)OUTPUT_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001352 acc1 += (VEC_INT)OUTPUT_OFFSET;
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001353
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001354 VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001355 VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001356
1357 res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
1358 res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001359
Georgios Pinitas37044642018-10-30 14:53:25 +00001360#if defined(DST_DEPTH)
1361 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1362#else /* defined(DST_DEPTH) */
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001363 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
Georgios Pinitas37044642018-10-30 14:53:25 +00001364#endif /* defined(DST_DEPTH) */
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001365
1366 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001367 (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001368 VSTORE(VEC_SIZE)
Georgios Pinitas60e98252018-10-22 16:17:20 +01001369 (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001370}
giuros016d109962019-01-07 17:47:19 +00001371#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001372
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001373#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
Giorgio Arenadfca60b2018-01-31 10:30:59 +00001374
Giorgio Arenaeff8d952018-07-02 15:29:57 +01001375#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
1376
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001377#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))