/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "helpers_asymm.h"

#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)

#if defined(FUSED_ACTIVATION)
#define DATA_TYPE uchar
#ifndef VEC_SIZE
#define VEC_SIZE 8
#endif /* VEC_SIZE */
#include "activation_layer_qa8.cl"
#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)
#else /* defined(FUSED_ACTIVATION) */
#define ACTIVATION_FUNC(x) (x)
#endif /* defined(FUSED_ACTIVATION) */

#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X)

#if CONV_STRIDE_X > 3
#error "Stride X not supported"
#endif /* CONV_STRIDE_X > 3 */

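// GET_VALUES loads one row of the input and de-interleaves it, according to CONV_STRIDE_X, into
// the three int8 vectors needed to compute 8 output pixels at once: "left", "middle" and "right"
// hold the samples that are multiplied by the first, second and third tap of a filter row.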
#if CONV_STRIDE_X == 1
#define GET_VALUES(first_value, left, middle, right) \
    ({ \
        int8 temp0 = CONVERT(vload8(0, first_value), int8); \
        int2 temp1 = CONVERT(vload2(0, (first_value + 8 * sizeof(uchar))), int2); \
        \
        left   = CONVERT(temp0.s01234567, int8); \
        middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \
        right  = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \
    })
#elif CONV_STRIDE_X == 2
#define GET_VALUES(first_value, left, middle, right) \
    ({ \
        int16 temp0 = CONVERT(vload16(0, first_value), int16); \
        int temp1   = CONVERT(*(first_value + 16 * sizeof(uchar)), int); \
        \
        left   = CONVERT(temp0.s02468ace, int8); \
        middle = CONVERT(temp0.s13579bdf, int8); \
        right  = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \
    })
#else /* CONV_STRIDE_X */
#define GET_VALUES(first_value, left, middle, right) \
    ({ \
        int16 temp0 = CONVERT(vload16(0, first_value), int16); \
        int8 temp1  = CONVERT(vload8(0, (first_value + 16 * sizeof(uchar))), int8); \
        \
        left   = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
        middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \
        right  = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \
    })
#endif /* CONV_STRIDE_X */

/** This function computes a quantized 3x3 depthwise convolution for the NCHW data layout.
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_quantized_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
#endif //defined(HAS_BIAS)

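    // With a depth multiplier, DEPTH_MULTIPLIER consecutive output channels are computed from the
    // same input channel, so rewind the Z offset implied by the output-driven work-item mapping.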
    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);
    uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);
    uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);

    int8 values0 = 0;
    int8 sum0 = 0;
#if CONV_STRIDE_Y == 1
    int8 values1 = 0;
    int8 sum1 = 0;
#endif /* CONV_STRIDE_Y */

    // Row0
    int8 left, middle, right;
    GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);
    values0 += left * (int8)(w0.s0);
    values0 += middle * (int8)(w0.s1);
    values0 += right * (int8)(w0.s2);

#if WEIGHTS_OFFSET != 0
    sum0 += left + middle + right;
#endif /* WEIGHTS_OFFSET != 0 */

    // Row1
    GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right);
    values0 += left * (int8)(w1.s0);
    values0 += middle * (int8)(w1.s1);
    values0 += right * (int8)(w1.s2);
#if CONV_STRIDE_Y == 1
    values1 += left * (int8)(w0.s0);
    values1 += middle * (int8)(w0.s1);
    values1 += right * (int8)(w0.s2);
#endif /* CONV_STRIDE_Y == 1 */

#if WEIGHTS_OFFSET != 0
    int8 tmp = left + middle + right;
    sum0 += tmp;
#if CONV_STRIDE_Y == 1
    sum1 += tmp;
#endif /* CONV_STRIDE_Y == 1 */
#endif /* WEIGHTS_OFFSET != 0 */

    // Row2
    GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right);
    values0 += left * (int8)(w2.s0);
    values0 += middle * (int8)(w2.s1);
    values0 += right * (int8)(w2.s2);
#if CONV_STRIDE_Y == 1
    values1 += left * (int8)(w1.s0);
    values1 += middle * (int8)(w1.s1);
    values1 += right * (int8)(w1.s2);
#endif /* CONV_STRIDE_Y == 1 */

#if WEIGHTS_OFFSET != 0
    tmp = left + middle + right;
    sum0 += tmp;
#if CONV_STRIDE_Y == 1
    sum1 += tmp;
#endif /* CONV_STRIDE_Y == 1 */
#endif /* WEIGHTS_OFFSET != 0 */

#if CONV_STRIDE_Y == 1
    // Row3
    GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right);
    values1 += left * (int8)(w2.s0);
    values1 += middle * (int8)(w2.s1);
    values1 += right * (int8)(w2.s2);

#if WEIGHTS_OFFSET != 0
    sum1 += left + middle + right;
#endif /* WEIGHTS_OFFSET != 0 */
#endif /* CONV_STRIDE_Y == 1 */

#if defined(HAS_BIAS)
    values0 += (int8)(bias_value);
#if CONV_STRIDE_Y == 1
    values1 += (int8)(bias_value);
#endif /* CONV_STRIDE_Y == 1 */
#endif //defined(HAS_BIAS)

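    // Compensation terms of the asymmetric-quantization expansion: WEIGHTS_OFFSET * sum(inputs),
    // INPUT_OFFSET * sum(weights) and the constant K_OFFSET cross term.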
#if WEIGHTS_OFFSET != 0
    values0 += sum0 * (int8)(WEIGHTS_OFFSET);
#if CONV_STRIDE_Y == 1
    values1 += sum1 * (int8)(WEIGHTS_OFFSET);
#endif /* CONV_STRIDE_Y == 1 */
#endif /* WEIGHTS_OFFSET != 0 */

#if INPUT_OFFSET != 0
    ushort sum_weights = 0;
    ushort3 tmp_we = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
    sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
    values0 += sum_weights * (int8)(INPUT_OFFSET);
#if CONV_STRIDE_Y == 1
    values1 += sum_weights * (int8)(INPUT_OFFSET);
#endif /* CONV_STRIDE_Y == 1 */
#endif /* INPUT_OFFSET != 0 */

#if K_OFFSET != 0
    values0 += (int8)(K_OFFSET);
#if CONV_STRIDE_Y == 1
    values1 += (int8)(K_OFFSET);
#endif /* CONV_STRIDE_Y == 1 */
#endif /* K_OFFSET != 0 */

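    // Requantize: scale by the fixed-point multiplier OUTPUT_MULTIPLIER >> OUTPUT_SHIFT, re-centre
    // on OUTPUT_OFFSET and saturate to the QASYMM8 range [0, 255].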
    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
    values0 += (int8)OUTPUT_OFFSET;
    uchar8 res0 = convert_uchar8_sat(values0);
    res0 = max(res0, (uchar8)0);
    res0 = min(res0, (uchar8)255);

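    // Apply the fused activation (identity when FUSED_ACTIVATION is not defined) and store 8 pixels.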
    vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
#if CONV_STRIDE_Y == 1

    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
    values1 += (int8)OUTPUT_OFFSET;
    uchar8 res1 = convert_uchar8_sat(values1);
    res1 = max(res1, (uchar8)0);
    res1 = min(res1, (uchar8)255);

    vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
#endif /* CONV_STRIDE_Y == 1 */
}

#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) */

#if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)

#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)

#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
#define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)

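// MULTIPLY_ADD widens the uchar operands to ushort before multiplying, so a single tap product
// cannot overflow, and accumulates in 32-bit. MULTIPLY_ADD_ACCUMULATE additionally tracks sum(x),
// which is only needed when WEIGHTS_OFFSET != 0.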
#define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_USHORT) * CONVERT(y, VEC_USHORT), VEC_INT)

#if WEIGHTS_OFFSET != 0
#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \
    ({ \
        sum += CONVERT(x, VEC_INT); \
        MULTIPLY_ADD(x, y, acc); \
    })
#else /* WEIGHTS_OFFSET != 0 */
#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
#endif /* WEIGHTS_OFFSET != 0 */

/** This function computes a quantized 3x3 depthwise convolution for the NHWC data layout (stride 1).
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
    VECTOR_DECLARATION(biases)
#endif /* defined(HAS_BIAS) */
)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

    VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
#endif /* defined(HAS_BIAS) */

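    // In NHWC the X dimension holds the channels while Y/Z walk the spatial width/height. Build the
    // padded coordinates of the six input rows needed to produce four outputs at unit stride.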
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_step_x;
    int8 y_coord = (int8)(y * (src_step_y / src_stride_y)) + (int8)(0, 1, 2, 3, 4, 5, 0, 0) - CONV_PAD_LEFT;
    int z_coord = z * (src_step_z / src_stride_z) - CONV_PAD_TOP;

    VEC_INT sum_we = 0;
    VEC_INT acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
    VEC_INT sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;

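    // The 3x3 filter is traversed as three Z-planes of the weights tensor; each VLOAD fetches
    // VEC_SIZE channels of one filter tap.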
    // z == 0
    VEC_UCHAR w0, w1, w2;
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    int valid_z = z_coord;
    int8 valid_y = select(y_coord, -1, (int8)valid_z < 0);           // If z < 0, set y to -1
    valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_1 (out of bounds)
    valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1);                       // Clamp z coordinate

    VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc1, sum1);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s5 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc3, sum3);

    weights.ptr += weights_stride_z;

    // z == 1
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    // Only unit pad_top/pad_bottom is allowed, so this plane can never be out of bounds
    valid_z = z_coord + 1;
    valid_y = y_coord;

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc1, sum1);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s5 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc3, sum3);

    weights.ptr += weights_stride_z;

    // z == 2
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    valid_z = z_coord + 2;
    valid_y = select(y_coord, -1, (int8)valid_z < 0);
    valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2);
    valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc1, sum1);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc1, sum1);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc3, sum3);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s5 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc3, sum3);

#if defined(HAS_BIAS)
    acc0 += bias_values;
    acc1 += bias_values;
    acc2 += bias_values;
    acc3 += bias_values;
#endif /* defined(HAS_BIAS) */

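    // Compensation terms: WEIGHTS_OFFSET * sum(inputs), INPUT_OFFSET * sum(weights) and the
    // constant K_OFFSET cross term.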
#if WEIGHTS_OFFSET != 0
    acc0 += WEIGHTS_OFFSET * sum0;
    acc1 += WEIGHTS_OFFSET * sum1;
    acc2 += WEIGHTS_OFFSET * sum2;
    acc3 += WEIGHTS_OFFSET * sum3;
#endif /* WEIGHTS_OFFSET != 0 */

#if INPUT_OFFSET != 0
    VEC_INT offs = INPUT_OFFSET * sum_we;

    acc0 += offs;
    acc1 += offs;
    acc2 += offs;
    acc3 += offs;
#endif /* INPUT_OFFSET != 0 */

#if K_OFFSET != 0
    acc0 += (VEC_INT)K_OFFSET;
    acc1 += (VEC_INT)K_OFFSET;
    acc2 += (VEC_INT)K_OFFSET;
    acc3 += (VEC_INT)K_OFFSET;
#endif /* K_OFFSET != 0 */

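    // Requantize the four output rows and saturate to the QASYMM8 range [0, 255].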
    acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
    acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
    acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
    acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);

    acc0 += (VEC_INT)OUTPUT_OFFSET;
    acc1 += (VEC_INT)OUTPUT_OFFSET;
    acc2 += (VEC_INT)OUTPUT_OFFSET;
    acc3 += (VEC_INT)OUTPUT_OFFSET;

    VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
    VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
    VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
    VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);

    res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
    res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
    res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
    res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);

    VSTORE(VEC_SIZE)
    (res0, 0, dst.ptr + 0 * dst_stride_y);
    VSTORE(VEC_SIZE)
    (res1, 0, dst.ptr + 1 * dst_stride_y);
    VSTORE(VEC_SIZE)
    (res2, 0, dst.ptr + 2 * dst_stride_y);
    VSTORE(VEC_SIZE)
    (res3, 0, dst.ptr + 3 * dst_stride_y);
}

/** This function computes a quantized 3x3 depthwise convolution for the NHWC data layout (stride 2).
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: QASYMM8
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_quantized_nhwc_stride2(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights),
#if defined(HAS_BIAS)
    VECTOR_DECLARATION(biases)
#endif /* defined(HAS_BIAS) */
)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

    VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
#endif /* defined(HAS_BIAS) */

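    // Stride 2: only the first five entries of y_coord are used, since the two outputs computed
    // per work-item cover input positions 0..4.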
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_step_x;
    int8 y_coord = (int8)(y * (src_step_y / src_stride_y)) + (int8)(0, 1, 2, 3, 4, 5, 0, 0) - CONV_PAD_LEFT;
    int z_coord = z * (src_step_z / src_stride_z) - CONV_PAD_TOP;

    VEC_INT sum_we = 0;
    VEC_INT acc0 = 0, acc2 = 0;
    VEC_INT sum0 = 0, sum2 = 0;

    // z == 0
    VEC_UCHAR w0, w1, w2;
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    int valid_z = z_coord;
    int8 valid_y = select(y_coord, -1, (int8)valid_z < 0);           // If z < 0, set y to -1
    valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2); // If z >= SRC_DIM_2, set y to SRC_DIM_1 (out of bounds)
    valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1);                       // Clamp z coordinate

    VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);

    weights.ptr += weights_stride_z;

    // z == 1
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    // Only unit pad_top/pad_bottom is allowed, so this plane can never be out of bounds
    valid_z = z_coord + 1;
    valid_y = y_coord;

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);

    weights.ptr += weights_stride_z;

    // z == 2
    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);

#if INPUT_OFFSET != 0
    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
#endif /* INPUT_OFFSET != 0 */

    valid_z = z_coord + 2;
    valid_y = select(y_coord, -1, (int8)valid_z < 0);
    valid_y = select(valid_y, SRC_DIM_1, (int8)valid_z >= SRC_DIM_2);
    valid_z = clamp(valid_z, 0, SRC_DIM_2 - 1);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s0 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s1 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc0, sum0);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s2 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc0, sum0);
    MULTIPLY_ADD_ACCUMULATE(values, w0, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s3 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w1, acc2, sum2);

    values = VLOAD(VEC_SIZE)(0, src_addr + valid_y.s4 * (int)src_stride_y + valid_z * src_stride_z);
    MULTIPLY_ADD_ACCUMULATE(values, w2, acc2, sum2);

#if defined(HAS_BIAS)
    acc0 += bias_values;
    acc2 += bias_values;
#endif /* defined(HAS_BIAS) */

#if WEIGHTS_OFFSET != 0
    acc0 += WEIGHTS_OFFSET * sum0;
    acc2 += WEIGHTS_OFFSET * sum2;
#endif /* WEIGHTS_OFFSET != 0 */

#if INPUT_OFFSET != 0
    VEC_INT offs = INPUT_OFFSET * sum_we;

    acc0 += offs;
    acc2 += offs;
#endif /* INPUT_OFFSET != 0 */

#if K_OFFSET != 0
    acc0 += (VEC_INT)K_OFFSET;
    acc2 += (VEC_INT)K_OFFSET;
#endif /* K_OFFSET != 0 */

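    // Requantize the two output rows and saturate to the QASYMM8 range [0, 255].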
    acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
    acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
    acc0 += (VEC_INT)OUTPUT_OFFSET;
    acc2 += (VEC_INT)OUTPUT_OFFSET;
    VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
    VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
    res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
    res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);

    VSTORE(VEC_SIZE)
    (res0, 0, dst.ptr + 0 * dst_stride_y);
    VSTORE(VEC_SIZE)
    (res2, 0, dst.ptr + 1 * dst_stride_y);
}

#endif /* defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) */

#endif /* defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */