blob: fa08b149e4b3735c0cb6b3e27b667ef85a162c42 [file] [log] [blame]
Gian Marco05288a22017-11-21 10:57:50 +00001/*
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00002 * Copyright (c) 2017-2019 ARM Limited.
Gian Marco05288a22017-11-21 10:57:50 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010024#include "gemm_helpers.h"
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +000025#include "helpers_asymm.h"
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +000026#include "repeat.h"
Gian Marco05288a22017-11-21 10:57:50 +000027
Georgios Pinitasdaa38552018-08-28 17:43:18 +010028#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
29#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010030#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010031#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010032#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010033#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
34#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenac50da382018-07-26 15:50:09 +010035
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010036#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
37
38/** Specialized macros to perform the dot product instruction between two vectors of size N [1,16]. These macros use the dot8 instruction */
39#define ARM_DOT1(a, b, c) \
40 ({ \
41 ARM_DOT((uchar4)(a, (uchar3)0), (uchar4)(b, (uchar3)0), c); \
42 })
43#define ARM_DOT2(a, b, c) \
44 ({ \
45 ARM_DOT((uchar4)(a, (uchar2)0), (uchar4)(b, (uchar2)0), c); \
46 })
47#define ARM_DOT3(a, b, c) \
48 ({ \
49 ARM_DOT((uchar4)(a, (uchar)0), (uchar4)(b, (uchar)0), c); \
50 })
51#define ARM_DOT4(a, b, c) \
52 ({ \
53 ARM_DOT(a, b, c); \
54 })
55#define ARM_DOT8(a, b, c) \
56 ({ \
57 ARM_DOT4((a.lo), (b.lo), c); \
58 ARM_DOT4((a.hi), (b.hi), c); \
59 })
60#define ARM_DOT16(a, b, c) \
61 ({ \
62 ARM_DOT8((a.lo), (b.lo), c); \
63 ARM_DOT8((a.hi), (b.hi), c); \
64 })
65
66#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
67
68/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] without using the dot8 instruction. */
Georgios Pinitas705fd3d2019-06-17 17:23:22 +010069#define ARM_DOT1(a, b, c) \
70 ({ \
71 c += (uint)a * b; \
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010072 })
73#define ARM_DOT2(a, b, c) \
74 ({ \
Georgios Pinitas705fd3d2019-06-17 17:23:22 +010075 c += (uint)a.s0 * b.s0; \
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010076 c += (uint)a.s1 * b.s1; \
77 })
78#define ARM_DOT3(a, b, c) \
79 ({ \
80 ARM_DOT2(a, b, c); \
81 c += (uint)a.s2 * b.s2; \
82 })
83#define ARM_DOT4(a, b, c) \
84 ({ \
85 ARM_DOT3(a, b, c); \
86 c += (uint)a.s3 * b.s3; \
87 })
88#define ARM_DOT8(a, b, c) \
89 ({ \
90 ARM_DOT4((a.lo), (b.lo), c); \
91 ARM_DOT4((a.hi), (b.hi), c); \
92 })
93#define ARM_DOT16(a, b, c) \
94 ({ \
95 ARM_DOT8((a.lo), (b.lo), c); \
96 ARM_DOT8((a.hi), (b.hi), c); \
97 })
98#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
99
100/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
101#define ARM_DOT_K0X2(k0, a, b, c) \
102 ({ \
103 ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
104 ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
105 })
106#define ARM_DOT_K0X3(k0, a, b, c) \
107 ({ \
108 ARM_DOT_K0X2(k0, a, b, c); \
109 ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
110 })
111#define ARM_DOT_K0X4(k0, a, b, c) \
112 ({ \
113 ARM_DOT_K0X3(k0, a, b, c); \
114 ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
115 })
116#define ARM_DOT_K0X8(k0, a, b, c) \
117 ({ \
118 ARM_DOT_K0X4(k0, a, b, c); \
119 ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
120 ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
121 ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
122 ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
123 })
124#define ARM_DOT_K0X16(k0, a, b, c) \
125 ({ \
126 ARM_DOT_K0X8(k0, a, b, c); \
127 ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
128 ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
129 ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
130 ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
131 ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
132 ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
133 ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
134 ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
135 })
136
Georgios Pinitas705fd3d2019-06-17 17:23:22 +0100137/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100138#define ARM_MM_K0XN0X1(n0, k0, a, b, c) \
139 ({ \
140 ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
141 })
142#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
143 ({ \
144 ARM_MM_K0XN0X1(n0, k0, a, b, c); \
145 ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
146 })
147#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
148 ({ \
149 ARM_MM_K0XN0X2(n0, k0, a, b, c); \
150 ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
151 })
152#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
153 ({ \
154 ARM_MM_K0XN0X3(n0, k0, a, b, c); \
155 ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
156 })
157#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
158 ({ \
159 ARM_MM_K0XN0X4(n0, k0, a, b, c); \
160 ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
161 })
162#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
163 ({ \
164 ARM_MM_K0XN0X5(n0, k0, a, b, c); \
165 ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
166 })
167#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
168 ({ \
169 ARM_MM_K0XN0X6(n0, k0, a, b, c); \
170 ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
171 })
172#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
173 ({ \
174 ARM_MM_K0XN0X7(n0, k0, a, b, c); \
175 ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
176 })
177
178#define ARM_DOT_K0(k0, a, b, c) \
179 ({ \
180 CONCAT(ARM_DOT, k0) \
181 ((a), (b), (c)); \
182 })
183
184#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
185 ({ \
186 CONCAT(ARM_DOT_K0X, n0) \
187 (k0, (a), b, (c)); \
188 })
189
190#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
191 ({ \
192 CONCAT(ARM_MM_K0XN0X, m0) \
193 (n0, k0, a, b, c); \
194 })
195
Gian Marco05288a22017-11-21 10:57:50 +0000196#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
197#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
198#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
199#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
200/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
201 *
202 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
203 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100204 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
205 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
206 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
207 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
208 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
209 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
210 *
Gian Marco05288a22017-11-21 10:57:50 +0000211 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
212 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
213 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
214 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
215 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
216 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
217 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
218 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
219 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
220 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
221 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
222 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
223 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
224 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
225 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
226 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
227 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
228 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100229 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
230 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
231 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
232 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
233 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +0000234 */
Gian Marco7b4d5472018-01-10 15:56:30 +0000235__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
236 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100237 IMAGE_DECLARATION(dst),
238 uint src0_stride_z,
239 uint src1_stride_z,
240 uint dst_stride_z
241#if defined(REINTERPRET_INPUT_AS_3D)
242 ,
243 uint src_cross_plane_pad
244#endif // REINTERPRET_INPUT_AS_3D
245#if defined(REINTERPRET_OUTPUT_AS_3D)
246 ,
247 uint dst_cross_plane_pad
248#endif // REINTERPRET_OUTPUT_AS_3D
249 )
Gian Marco05288a22017-11-21 10:57:50 +0000250{
251 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
252
253 // Compute starting address for matrix A and Matrix B
254 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
255
256 // Update address for the matrix A
257 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
258
259 // Update address for the matrix B
260 src_addr.s1 += idx;
261
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100262#if defined(REINTERPRET_INPUT_AS_3D)
263 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
264 // in order to take into account the presence of possible cross plane paddings
265 //
266 // | |
267 // | plane0 |
268 // | |
269 // |__________________|
270 // |******************|
271 // | cross_plane_pad |
272 // |******************|
273 // | |
274 // | plane1 |
275 // | |
276 // |__________________|
277
278 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
279 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
280 zin = min(DEPTH_GEMM3D - 1, zin);
281
282 // Add offset due to the cross plane paddings
283 zin *= (src_cross_plane_pad * src0_stride_y);
284
285 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
286 // multiply src0_stride_z by DEPTH_GEMM3D
287 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
288
289#else // defined(REINTERPRET_INPUT_AS_3D)
290
291 // Add offset for batched GEMM
292 src_addr.s0 += get_global_id(2) * src0_stride_z;
293
294#endif // defined(REINTERPRET_INPUT_AS_3D)
295
296#if defined(MATRIX_B_DEPTH)
297 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
298 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
299#else // defined(MATRIX_B_DEPTH)
300 src_addr.s1 += get_global_id(2) * src1_stride_z;
301#endif // defined(MATRIX_B_DEPTH)
302
Gian Marco05288a22017-11-21 10:57:50 +0000303 int end_row_vec_a = src_addr.s0 + COLS_A;
304
305 VECTOR_UINT acc0 = 0;
306#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
307 VECTOR_UINT acc1 = 0;
308#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
309#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
310 VECTOR_UINT acc2 = 0;
311#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
312#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
313 VECTOR_UINT acc3 = 0;
314#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000315#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
316 VECTOR_UINT acc4 = 0;
317#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000318
319 for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
320 {
321 // Load values from matrix A
322 uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
323#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
324 uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
325#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
326#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
327 uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
328#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
329#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
330 uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
331#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000332#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
333 uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
334#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000335 // Load values from matrix B
336 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
337 VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);
338
339 // Accumulate
340 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
341 acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
342#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
343 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
344 acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
345#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
346#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
347 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
348 acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
349#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
350#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
351 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
352 acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
353#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000354#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
355 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
356 acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
357#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000358 }
359
360 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
361 {
362 // Load values from matrix A
363 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
364#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
365 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
366#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
367#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
368 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
369#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
370#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
371 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
372#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000373#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
374 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
375#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000376 // Load values from matrix B
377 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
378
379 // Accumulate
380 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
381#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
382 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
383#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
384#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
385 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
386#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
387#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
388 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
389#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000390#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
391 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
392#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000393 }
394
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100395 const int z = get_global_id(2);
396
Gian Marco05288a22017-11-21 10:57:50 +0000397 // Compute destination address
398 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
399
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100400#if defined(REINTERPRET_OUTPUT_AS_3D)
401 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
402 // in order to take into account the presence of possible cross plane paddings
403 //
404 // | |
405 // | plane0 |
406 // | |
407 // |__________________|
408 // |******************|
409 // | cross_plane_pad |
410 // |******************|
411 // | |
412 // | plane1 |
413 // | |
414 // |__________________|
415
416 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
417 uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
418 zout = min(DEPTH_GEMM3D - 1, zout);
419
420 // Add offset due to the cross plane paddings
421 zout *= (dst_cross_plane_pad * dst_stride_y);
422
423 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
424 // multiply dst_stride_z by DEPTH_GEMM3D
425 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
426
Gian Marco05288a22017-11-21 10:57:50 +0000427 // Store the result
428 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100429 (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
Gian Marco05288a22017-11-21 10:57:50 +0000430#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
431 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100432 (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
Gian Marco05288a22017-11-21 10:57:50 +0000433#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
434#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
435 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100436 (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
Gian Marco05288a22017-11-21 10:57:50 +0000437#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
438#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
439 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100440 (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
Gian Marco05288a22017-11-21 10:57:50 +0000441#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000442#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
443 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100444 (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
Gian Marco7b4d5472018-01-10 15:56:30 +0000445#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100446
447#else // defined(REINTERPRET_OUTPUT_AS_3D)
448 // Add offset for batched GEMM
449 dst.ptr += z * dst_stride_z;
450
451 // Store the result
452 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
453 (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
454#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
455 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
456 (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
457#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
458#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
459 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
460 (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
461#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
462#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
463 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
464 (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
465#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
466#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
467 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
468 (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
469#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
470#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco7b4d5472018-01-10 15:56:30 +0000471}
Gian Marco05288a22017-11-21 10:57:50 +0000472#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
473
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +0000474#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100475/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM data type.
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000476 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
477 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
478 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +0000479 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
480 * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000481 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
482 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
483 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
484 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
485 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
486 * @note Only the following configurations of M0, N0 and K0 are currently supported:
487 * - M0 = 2, 3, 4, 5, 6, 7, 8
488 * - N0 = 2, 3, 4, 8, 16
489 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000490 * - V0 >= 1
491 * - H0 >= 1
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000492 *
493 * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
494 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
495 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
496 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
497 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
498 *
499 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8
500 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
501 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
502 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
503 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
504 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
505 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
506 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
507 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
508 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
509 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
510 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
511 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
512 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
513 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
514 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
515 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
516 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
517 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
518 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
519 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
520 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
521 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
522 */
523__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
524 IMAGE_DECLARATION(rhs),
525 IMAGE_DECLARATION(dst),
526 uint k,
527 uint lhs_stride_z,
528 uint rhs_stride_z,
529 uint dst_stride_z
530#if defined(REINTERPRET_OUTPUT_AS_3D)
531 ,
532 uint dst_cross_plane_pad
533#endif // REINTERPRET_OUTPUT_AS_3D
534 )
535{
536 // Block size
537#define LHS_BLOCK_SIZE ((K0) * (M0))
538
539#if defined(LHS_INTERLEAVE)
540#define LHS_OFFSET_X (K0)
541#define LHS_STEP_X ((K0) * (V0))
542#define LHS_STEP_LOOP (1)
543#else // defined(INTERLEAVE)
544#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
545#define LHS_STEP_X (K0)
546#define LHS_STEP_LOOP (V0)
547#endif // defined(INTERLEAVE)
548
549 // Block size
550#define RHS_BLOCK_SIZE ((K0) * (N0))
551
552 // RHS offset and step X
553#if defined(RHS_INTERLEAVE)
554#define RHS_OFFSET_X (K0)
555#define RHS_STEP_X ((K0) * (H0))
556#define RHS_STEP_LOOP (1)
557#else // defined(RHS_INTERLEAVE)
558#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
559#define RHS_STEP_X (K0)
560#define RHS_STEP_LOOP (H0)
561#endif // defined(RHS_INTERLEAVE)
562
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100563 uint x = get_global_id(0);
564 uint y = get_global_id(1);
565 uint z = get_global_id(2);
566
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +0000567#if defined(DUMMY_WORK_ITEMS)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100568 if((x * N0 >= N) || (y * M0 >= M))
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +0000569 {
570 return;
571 }
572#endif // defined(DUMMY_WORK_ITEMS)
573
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000574 // Compute LHS matrix address
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100575 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000576
577 // Compute RHS matrix address
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100578 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000579
580#if defined(MATRIX_B_DEPTH)
581 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100582 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000583#else // defined(MATRIX_B_DEPTH)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100584 rhs_addr += z * rhs_stride_z;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000585#endif // defined(MATRIX_B_DEPTH)
586
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100587 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
588 REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
589
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000590 // Initialize the accumulators
591 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
592
593 for(int i = 0; i < k; i += K0)
594 {
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000595 // Load values from LHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100596 LOAD_BLOCK(M0, K0, uchar, a, lhs_addr, 0, LHS_STEP_X, zlhs);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000597
598 // Load values from RHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100599 LOAD_BLOCK(N0, K0, uchar, b, rhs_addr, 0, RHS_STEP_X, zrhs);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000600
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100601 // Partial matrix multiplication M0,N0,K0
602 ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000603
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100604 // Update address
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000605 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
606 rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
607 }
608
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100609 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000610
611 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
612
613#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100614 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
615 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000616
617 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
618 // multiply dst_stride_z by DEPTH_GEMM3D
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100619 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000620
621#else // defined(REINTERPRET_OUTPUT_AS_3D)
622
623 // Add offset for batched GEMM
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100624 dst_addr += z * dst_stride_z;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000625
626#endif // defined(REINTERPRET_OUTPUT_AS_3D)
627
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100628 // Convert and store output block
629 CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000630
631#undef LHS_BLOCK_SIZE
632#undef LHS_OFFSET_X
633#undef LHS_STEP_X
634#undef RHS_BLOCK_SIZE
635#undef RHS_OFFSET_X
636#undef RHS_STEP_X
637}
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000638#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K)
639
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000640#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
641
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000642/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
643 * The LHS matrix is NOT reshaped
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100644 * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000645 *
646 * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
647 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
648 * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
649 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
650 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
651 * @note Only the following configurations of M0, N0 and K0 are currently supported:
652 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
653 * - N0 = 2, 3, 4, 8, 16
654 * - K0 = 2, 3, 4, 8, 16
655 * - H0 >= 1
656 *
657 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
658 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
659 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
660 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
661 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
662 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
663 *
664 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
665 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
666 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
667 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
668 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
669 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
670 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
671 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
672 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
673 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
674 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
675 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
676 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
677 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
678 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
679 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
680 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
681 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
682 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
683 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
684 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
685 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
686 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
687 */
688__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
689 IMAGE_DECLARATION(rhs),
690 IMAGE_DECLARATION(dst),
691 uint lhs_stride_z,
692 uint rhs_stride_z,
693 uint dst_stride_z
694#if defined(REINTERPRET_INPUT_AS_3D)
695 ,
696 uint lhs_cross_plane_pad
697#endif // REINTERPRET_INPUT_AS_3D
698#if defined(REINTERPRET_OUTPUT_AS_3D)
699 ,
700 uint dst_cross_plane_pad
701#endif // REINTERPRET_OUTPUT_AS_3D
702 )
703{
704 // Block size
705#define RHS_BLOCK_SIZE ((K0) * (N0))
706
707 // RHS offset and step X
708#if defined(RHS_INTERLEAVE)
709#define RHS_OFFSET_X (K0)
710#define RHS_STEP_X ((K0) * (H0))
711#define RHS_STEP_LOOP (1)
712#else // defined(RHS_INTERLEAVE)
713#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
714#define RHS_STEP_X (K0)
715#define RHS_STEP_LOOP (H0)
716#endif // defined(RHS_INTERLEAVE)
717
718 uint x = get_global_id(0);
719 uint y = get_global_id(1);
720 uint z = get_global_id(2);
721
Gian Marco Iodice86cfffe2019-04-02 11:02:20 +0100722#if defined(DUMMY_WORK_ITEMS)
723 if((x * N0 >= N) || (y * M0 >= M))
724 {
725 return;
726 }
727#endif // defined(DUMMY_WORK_ITEMS)
728
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000729 // Compute LHS matrix address
730 uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
731
732 // Compute RHS matrix address
733 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
734
735#if defined(MATRIX_B_DEPTH)
736 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
737 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
738#else // defined(MATRIX_B_DEPTH)
739 rhs_offset += z * rhs_stride_z;
740#endif // defined(MATRIX_B_DEPTH)
741
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100742 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
743 REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000744
745#if defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100746 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
747 CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000748
749 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
750 // multiply lhs_stride_z by DEPTH_GEMM3D
751 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
752
753#else // defined(REINTERPRET_INPUT_AS_3D)
754
755 // Add offset for batched GEMM
756 lhs_offset += z * lhs_stride_z;
757
758#endif // defined(REINTERPRET_INPUT_AS_3D)
759
760 // Initialize the accumulators
761 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;
762
763 for(int i = 0; i < K; i += K0)
764 {
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000765 // Load values from LHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100766 LOAD_BLOCK(M0, K0, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000767
768 // Load values from RHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100769 LOAD_BLOCK(N0, K0, uchar, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000770
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100771 // Partial matrix multiplication M0,N0,K0
772 ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000773
774 lhs_offset += K0;
775 rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
776 }
777
778 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
779
780 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
781
782#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000783 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100784 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000785
786 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
787 // multiply dst_stride_z by DEPTH_GEMM3D
788 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
789
790#else // defined(REINTERPRET_OUTPUT_AS_3D)
791
792 // Add offset for batched GEMM
793 dst_addr += z * dst_stride_z;
794
795#endif // defined(REINTERPRET_OUTPUT_AS_3D)
796
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100797 // Convert and store output block
798 CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
Gian Marco Iodice62251f72019-03-11 16:07:12 +0000799
800#undef RHS_BLOCK_SIZE
801#undef RHS_OFFSET_X
802#undef RHS_STEP_X
803}
804#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(K)
805
Gian Marco Iodicee7510622019-06-03 17:28:17 +0100806#if defined(M0) && defined(N0) && defined(K0) && defined(K)
807
808/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
809 * The LHS matrix is NOT reshaped
810 * The RHS matrix is NOT reshaped
811 *
812 * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
813 * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
814 * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)
815 * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., -DK0=2)
816 * @note Only the following configurations of M0, N0 and K0 are currently supported:
817 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
818 * - N0 = 2, 3, 4, 8, 16
819 * - K0 = 2, 3, 4, 8, 16
820 *
821 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
822 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
823 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
824 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
825 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
826 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
827 *
828 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
829 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
830 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
831 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
832 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
833 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
834 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
835 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
836 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
837 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
838 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
839 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
840 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
841 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
842 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
843 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
844 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
845 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
846 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
847 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
848 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
849 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
850 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
851 */
852__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
853 IMAGE_DECLARATION(rhs),
854 IMAGE_DECLARATION(dst),
855 uint lhs_stride_z,
856 uint rhs_stride_z,
857 uint dst_stride_z
858#if defined(REINTERPRET_INPUT_AS_3D)
859 ,
860 uint lhs_cross_plane_pad
861#endif // REINTERPRET_INPUT_AS_3D
862#if defined(REINTERPRET_OUTPUT_AS_3D)
863 ,
864 uint dst_cross_plane_pad
865#endif // REINTERPRET_OUTPUT_AS_3D
866 )
867{
868 uint x = get_global_id(0);
869 uint y = get_global_id(1);
870 uint z = get_global_id(2);
871
872#if defined(DUMMY_WORK_ITEMS)
873 if((x * N0 >= N) || (y * M0 >= M))
874 {
875 return;
876 }
877#endif // defined(DUMMY_WORK_ITEMS)
878
879 // Compute LHS matrix address
880 uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
881
882 // Compute RHS matrix address
883 uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0;
884
885#if defined(MATRIX_B_DEPTH)
886 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
887 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
888#else // defined(MATRIX_B_DEPTH)
889 rhs_offset += z * rhs_stride_z;
890#endif // defined(MATRIX_B_DEPTH)
891
892 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
893 REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
894
895#if defined(REINTERPRET_INPUT_AS_3D)
896 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
897 CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
898
899 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
900 // multiply lhs_stride_z by DEPTH_GEMM3D
901 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
902
903#else // defined(REINTERPRET_INPUT_AS_3D)
904
905 // Add offset for batched GEMM
906 lhs_offset += z * lhs_stride_z;
907
908#endif // defined(REINTERPRET_INPUT_AS_3D)
909
910 // Initialize the accumulators
911 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
912
913 int i = 0;
914
915 for(; i <= (K - K0); i += K0)
916 {
917 // Load values from LHS matrix
918 LOAD_BLOCK(M0, K0, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
919
920 // Load values from RHS matrix
921 LOAD_BLOCK(K0, N0, uchar, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
922
923 // Transpose the values from RHS matrix
924 TRANSPOSE_K0XN0(K0, N0, b_t, b);
925
926 // Partial matrix multiplication M0,N0,K0
927 ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
928
929 // Update the offset
930 lhs_offset += K0;
931 rhs_offset += K0 * rhs_stride_y;
932 }
933
934 // Left-over for loop
935 for(; i < K; ++i)
936 {
937 // Load values from LHS matrix
938 LOAD_BLOCK(M0, 1, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
939
940 // Load values from RHS matrix
941 LOAD_BLOCK(1, N0, uchar, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
942
943 // Transpose the values from RHS matrix
944 TRANSPOSE_K0XN0(1, N0, b_t, b);
945
946 // Partial matrix multiplication M0,N0,1
947 ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
948
949 // Update the offset
950 lhs_offset += 1;
951 rhs_offset += rhs_stride_y;
952 }
953
954 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
955
956 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
957
958#if defined(REINTERPRET_OUTPUT_AS_3D)
959 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
960 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
961
962 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
963 // multiply dst_stride_z by DEPTH_GEMM3D
964 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
965
966#else // defined(REINTERPRET_OUTPUT_AS_3D)
967
968 // Add offset for batched GEMM
969 dst_addr += z * dst_stride_z;
970
971#endif // defined(REINTERPRET_OUTPUT_AS_3D)
972
973 // Convert and store output block
974 CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
975}
976#endif // defined(M0) && defined(N0) && defined(K0) && defined(K)
977
Gian Marco05288a22017-11-21 10:57:50 +0000978#if defined(COLS_A)
979/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
980 *
981 * @note This stage is needed to handle the offset of matrix product
982 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
983 *
984 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
985 *
986 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
987 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
988 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
989 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
990 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
991 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
992 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
993 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
994 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
995 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
996 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
997 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
998 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
999 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1000 */
1001__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
1002 IMAGE_DECLARATION(dst))
1003{
1004 // Compute source and destination addresses
1005 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1006 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1007
1008 uint4 sum_row_u32 = (uint4)0;
1009 uint sum_row = 0;
1010
1011 __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
1012
1013 int i = 0;
1014
1015 // This for loop performs 16 accumulations
1016 for(; i <= ((int)COLS_A - 16); i += 16)
1017 {
1018 const uchar16 a0_u8 = vload16(0, matrix_a + i);
1019
1020 sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
1021 }
1022
1023 // This for loop performs the leftover accumulations
1024 for(; i < COLS_A; ++i)
1025 {
1026 sum_row += matrix_a[i];
1027 }
1028
1029 sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;
1030
1031 *((__global int *)dst.ptr) = (int)sum_row;
1032}
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001033
1034#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
1035/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction
1036 *
1037 * @note This stage is needed to handle the offset of matrix product
1038 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
1039 *
1040 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
1041 *
1042 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
1043 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1044 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1045 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1046 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1047 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1048 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1049 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1050 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
1051 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1052 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1053 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1054 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1055 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1056 */
1057__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
1058 IMAGE_DECLARATION(dst))
1059{
1060 // Compute source and destination addresses
1061 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1062 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1063
1064 uint sum_row = 0;
1065
1066 __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
1067
1068 int i = 0;
1069
1070 // This for loop performs 16 accumulations
1071 for(; i <= ((int)COLS_A - 32); i += 32)
1072 {
1073 uchar16 a0_u8 = vload16(0, matrix_a + i);
1074
1075 sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
1076 sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
1077 sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
1078 sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
1079
1080 a0_u8 = vload16(1, matrix_a + i);
1081
1082 sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
1083 sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
1084 sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
1085 sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
1086 }
1087
1088 // This for loop performs the leftover accumulations
1089 for(; i < COLS_A; ++i)
1090 {
1091 sum_row += matrix_a[i];
1092 }
1093
1094 *((__global int *)dst.ptr) = (int)sum_row;
1095}
1096#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Gian Marco05288a22017-11-21 10:57:50 +00001097#endif // defined(COLS_A)
1098
1099#if defined(COLS_B) && defined(ROWS_B)
1100/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
1101 *
1102 * @note This stage is needed to handle the offset of matrix product
1103 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
1104 *
1105 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
1106 *
1107 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
1108 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1109 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1110 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1111 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1112 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1113 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1114 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1115 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
1116 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1117 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1118 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1119 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1120 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1121 */
1122__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
1123 IMAGE_DECLARATION(dst))
1124{
1125 // Compute source and destination addresses
1126 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1127 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1128
1129 uint16 sum_col_u32 = (uint16)0;
1130
1131 __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);
1132
1133 int i = 0;
1134 // This for loop performs 4 accumulations
1135 for(; i <= ((int)ROWS_B - 4); i += 4)
1136 {
1137 const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
1138 const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
1139 const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
1140 const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);
1141
1142 sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);
1143
1144 matrix_b += 4 * src_stride_y;
1145 }
1146
1147 // This for loop perfoms the leftover accumulations
1148 for(; i < (int)ROWS_B; ++i)
1149 {
1150 const uchar16 b0_u8 = vload16(0, matrix_b);
1151
1152 sum_col_u32 += convert_uint16(b0_u8);
1153
1154 matrix_b += src_stride_y;
1155 }
1156
1157 vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
1158}
1159#endif // defined(COLS_B) && defined(ROWS_B)
1160
1161#if defined(K_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001162
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001163/* Helper function used to calculate the offset contribution after matrix multiplication.
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001164 *
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001165 * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001166 * and calculates the offset contribution of matrix A and matrix B.
1167 *
1168 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
1169 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
1170 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
1171 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
1172 *
1173 * @param[in] x get_global_id(0) * 4
1174 * @param[in] y get_global_id(1)
1175 * @param[in] z get_global_id(2)
1176 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
1177 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
1178 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
1179 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
1180 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
1181 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
1182 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
1183 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
1184 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
1185 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
1186 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
1187 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
1188 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
1189 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
1190 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1191 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
1192 */
1193inline int4 offset_contribution(
1194 int x,
1195 int y,
1196 int z
1197#if defined(A_OFFSET)
1198 ,
1199 IMAGE_DECLARATION(sum_col)
1200#endif // defined(A_OFFSET)
1201#if defined(B_OFFSET)
1202 ,
1203 IMAGE_DECLARATION(sum_row)
1204#endif // defined(B_OFFSET)
1205#if defined(ADD_BIAS)
1206 ,
1207 VECTOR_DECLARATION(biases)
1208#endif // defined(ADD_BIAS)
1209)
1210{
1211 int4 a_offset_s32 = (int4)0;
1212 int4 b_offset_s32 = (int4)0;
1213
1214 int batch_id = z;
1215#if defined(DEPTH_INPUT3D)
1216 batch_id /= (int)DEPTH_INPUT3D;
1217#endif // defined(DEPTH_INPUT3D)
1218
1219#if defined(A_OFFSET)
1220 // Compute the offset contribution due to A_OFFSET
1221 __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
1222
1223 // Compute the offset contribution due to A_OFFSET
1224#if defined(SUM_COL_HAS_BATCHES)
1225 a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
1226#else // defined(SUM_COL_HAS_BATCHES)
1227 a_offset_s32 = vload4(0, (__global int *)sum_col_addr);
1228#endif // defined(SUM_COL_HAS_BATCHES)
1229
1230 a_offset_s32 *= (int4)A_OFFSET;
1231#endif // defined(A_OFFSET)
1232
1233#if defined(B_OFFSET)
1234 // Compute the offset contribution due to A_OFFSET
1235 __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
1236
1237 // Compute the offset contribution due to B_OFFSET
1238#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
1239 b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
1240#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
1241 b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
1242#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
1243 b_offset_s32 *= (int4)B_OFFSET;
1244#endif // defined(B_OFFSET)
1245
1246#if defined(ADD_BIAS)
1247 // Add bias
1248 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
1249
1250 int4 biases_values = vload4(0, (__global int *)bias_addr);
1251 b_offset_s32 += (int4)biases_values;
1252#endif // defined(ADD_BIAS)
1253
1254 return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
1255}
1256
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001257/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is performed in-place
Gian Marco05288a22017-11-21 10:57:50 +00001258 *
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001259 * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
Gian Marco05288a22017-11-21 10:57:50 +00001260 * and adds to it the offset contribution of matrix A and matrix B in-place.
1261 *
1262 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
1263 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
1264 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
Chunosov5124be52017-11-22 20:42:13 +07001265 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
Gian Marco05288a22017-11-21 10:57:50 +00001266 *
1267 * The final result is:
1268 *
1269 * mm_result[i][k] = mm_result[i][k] +
1270 * (sum_col[k] * A_OFFSET) +
1271 * (sum_row[i] * B_OFFSET) +
1272 * (K_OFFSET)
1273 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001274 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
1275 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
1276 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
1277 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
1278 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
1279 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
1280 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
1281 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001282 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
1283 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
1284 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
1285 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
1286 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
1287 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
1288 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
1289 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
1290 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
1291 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
1292 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
1293 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
1294 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
1295 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
1296 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1297 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco05288a22017-11-21 10:57:50 +00001298 */
1299__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
1300#if defined(A_OFFSET)
1301 ,
1302 IMAGE_DECLARATION(sum_col)
1303#endif // defined(A_OFFSET)
1304#if defined(B_OFFSET)
1305 ,
1306 IMAGE_DECLARATION(sum_row)
1307#endif // defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001308#if defined(ADD_BIAS)
1309 ,
1310 VECTOR_DECLARATION(biases)
1311#endif // defined(ADD_BIAS))
Gian Marco05288a22017-11-21 10:57:50 +00001312 )
1313{
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001314 const int x = get_global_id(0) * 4;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001315 const int y = get_global_id(1);
1316 const int z = get_global_id(2);
1317
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001318 // Compute offset contribution
1319 int4 offset_term_s32 = offset_contribution(
1320 x, y, z
Gian Marco05288a22017-11-21 10:57:50 +00001321#if defined(A_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001322 ,
1323 sum_col_ptr,
1324 sum_col_stride_x,
1325 sum_col_step_x,
1326 sum_col_stride_y,
1327 sum_col_step_y,
1328 sum_col_offset_first_element_in_bytes
Gian Marco05288a22017-11-21 10:57:50 +00001329#endif // defined(A_OFFSET)
Gian Marco05288a22017-11-21 10:57:50 +00001330#if defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001331 ,
1332 sum_row_ptr,
1333 sum_row_stride_x,
1334 sum_row_step_x,
1335 sum_row_stride_y,
1336 sum_row_step_y,
1337 sum_row_offset_first_element_in_bytes
Gian Marco05288a22017-11-21 10:57:50 +00001338#endif // defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001339#if defined(ADD_BIAS)
1340 ,
1341 biases_ptr,
1342 biases_stride_x,
1343 biases_step_x,
1344 biases_offset_first_element_in_bytes
1345#endif // defined(ADD_BIAS)
1346 );
Gian Marco05288a22017-11-21 10:57:50 +00001347
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001348 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
Gian Marco05288a22017-11-21 10:57:50 +00001349
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001350 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
Gian Marco05288a22017-11-21 10:57:50 +00001351
1352 // Add the offset terms to GEMM's result
1353 in_s32 += offset_term_s32;
1354
1355 // Store the result with the offset contribution
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001356 vstore4(in_s32, 0, (__global int *)mm_result_addr);
Gian Marco05288a22017-11-21 10:57:50 +00001357}
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001358
1359#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
1360/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.
1361 *
1362 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
1363 *
1364 *
1365 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
1366 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
1367 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
1368 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
1369 *
1370 * The result before the output stage is:
1371 *
1372 * mm_result[i][k] = mm_result[i][k] +
1373 * (sum_col[k] * A_OFFSET) +
1374 * (sum_row[i] * B_OFFSET) +
1375 * (K_OFFSET)
1376 *
1377 * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
1378 *
1379 * -# Add offset terms to final result
1380 * -# Multiply each entry of result by result_mult_int
1381 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
1382 * -# Shift the int32 accumulator by result_shift
1383 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
1384 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
1385 *
1386 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
1387 *
1388 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
1389 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
1390 * These values can be used to implement "rectified linear unit" activation functions
1391 *
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001392 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
1393 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
1394 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
1395 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
1396 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
1397 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
1398 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
1399 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
1400 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
1401 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
1402 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
1403 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
1404 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
1405 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
1406 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
1407 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
1408 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
1409 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
1410 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
1411 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
1412 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
1413 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
1414 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1415 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
1416 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
1417 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1418 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1419 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1420 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1421 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1422 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1423 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1424 * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
1425 * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes)
1426 * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
1427 * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
1428 * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
1429 * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes)
1430 * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
1431 * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001432 */
1433__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
1434#if defined(A_OFFSET)
1435 ,
1436 IMAGE_DECLARATION(sum_col)
1437#endif // defined(A_OFFSET)
1438#if defined(B_OFFSET)
1439 ,
1440 IMAGE_DECLARATION(sum_row)
1441#endif // defined(B_OFFSET)
1442 ,
1443#if defined(ADD_BIAS)
1444 VECTOR_DECLARATION(biases),
1445#endif // defined(ADD_BIAS)
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001446 TENSOR3D_DECLARATION(dst)
1447#if defined(PER_CHANNEL_QUANTIZATION)
1448 ,
1449 VECTOR_DECLARATION(result_multipliers),
1450 VECTOR_DECLARATION(result_shifts)
1451#endif // defined(PER_CHANNEL_QUANTIZATION)
1452 )
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001453{
1454 const int x = get_global_id(0) * 4;
1455 const int y = get_global_id(1);
1456 const int z = get_global_id(2);
1457
1458 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1459
1460 // Compute offset contribution
1461 int4 offset_term_s32 = offset_contribution(
1462 x, y, z
1463#if defined(A_OFFSET)
1464 ,
1465 sum_col_ptr,
1466 sum_col_stride_x,
1467 sum_col_step_x,
1468 sum_col_stride_y,
1469 sum_col_step_y,
1470 sum_col_offset_first_element_in_bytes
1471#endif // defined(A_OFFSET)
1472#if defined(B_OFFSET)
1473 ,
1474 sum_row_ptr,
1475 sum_row_stride_x,
1476 sum_row_step_x,
1477 sum_row_stride_y,
1478 sum_row_step_y,
1479 sum_row_offset_first_element_in_bytes
1480#endif // defined(B_OFFSET)
1481#if defined(ADD_BIAS)
1482 ,
1483 biases_ptr,
1484 biases_stride_x,
1485 biases_step_x,
1486 biases_offset_first_element_in_bytes
1487#endif // defined(ADD_BIAS)
1488 );
1489
1490 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
1491
1492 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
1493
1494 // Add the offset terms to GEMM's result
1495 in_s32 += offset_term_s32;
1496
1497 // -------------- OUTPUT STAGE
1498
1499 // Add the offset terms to GEMM's result
1500 in_s32 += (int4)RESULT_OFFSET;
1501
1502 // Multiply by result_mult_int and shift
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001503#if defined(PER_CHANNEL_QUANTIZATION)
1504 __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
1505 __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
1506 int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
1507 int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr);
1508
1509 in_s32 *= result_multipliers_values;
1510 in_s32 >>= result_shifts_values;
1511#else // defined(PER_CHANNEL_QUANTIZATION)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001512 in_s32 *= RESULT_MULTIPLIER;
1513
1514 in_s32 >>= RESULT_SHIFT;
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001515#endif // defined(PER_CHANNEL_QUANTIZATION)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001516
1517 uchar4 res = convert_uchar4_sat(in_s32);
1518
1519#if defined(MIN_BOUND)
1520 res = max(res, (uchar4)MIN_BOUND);
1521#endif // defined(MIN_BOUND)
1522#if defined(MAX_BOUND)
1523 res = min(res, (uchar4)MAX_BOUND);
1524#endif // defined(MAX_BOUND)
1525
1526 // Store the result
1527 vstore4(res, 0, dst_addr);
1528}
1529
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001530/* OpenCL kernel used to add the offset contribution after matrix multiplication and it quantizes down to uint8.
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001531 *
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001532 * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001533 *
1534 *
1535 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
1536 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
1537 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
1538 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
1539 *
1540 * The result before the output stage is:
1541 *
1542 * mm_result[i][k] = mm_result[i][k] +
1543 * (sum_col[k] * A_OFFSET) +
1544 * (sum_row[i] * B_OFFSET) +
1545 * (K_OFFSET)
1546 *
1547 * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
1548 *
1549 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
1550 * -# Add bias to final result if bias tensor is not a nullptr
1551 * -# Round to nearest division by a power-of-two using result_shift
1552 * -# Add offset to each result
1553 * -# Clamp the value between the specified min and max bounds
1554 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
1555 *
1556 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
1557 *
1558 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
1559 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
1560 * These values can be used to implement "rectified linear unit" activation functions
1561 *
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001562 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
1563 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
1564 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
1565 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
1566 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
1567 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
1568 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
1569 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
1570 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
1571 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
1572 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
1573 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
1574 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
1575 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
1576 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
1577 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
1578 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
1579 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
1580 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
1581 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
1582 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
1583 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
1584 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1585 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
1586 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
1587 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1588 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1589 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1590 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1591 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1592 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1593 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1594 * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
1595 * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes)
1596 * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
1597 * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
1598 * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
1599 * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes)
1600 * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
1601 * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001602 */
1603__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
1604#if defined(A_OFFSET)
1605 ,
1606 IMAGE_DECLARATION(sum_col)
1607#endif // defined(A_OFFSET)
1608#if defined(B_OFFSET)
1609 ,
1610 IMAGE_DECLARATION(sum_row)
1611#endif // defined(B_OFFSET)
1612 ,
1613#if defined(ADD_BIAS)
1614 VECTOR_DECLARATION(biases),
1615#endif // defined(ADD_BIAS)
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001616 TENSOR3D_DECLARATION(dst)
1617#if defined(PER_CHANNEL_QUANTIZATION)
1618 ,
1619 VECTOR_DECLARATION(result_multipliers),
1620 VECTOR_DECLARATION(result_shifts)
1621#endif // defined(PER_CHANNEL_QUANTIZATION)
1622 )
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001623{
1624 const int x = get_global_id(0) * 4;
1625 const int y = get_global_id(1);
1626 const int z = get_global_id(2);
1627
1628 // Compute offset contribution
1629 int4 offset_term_s32 = offset_contribution(
1630 x, y, z
1631#if defined(A_OFFSET)
1632 ,
1633 sum_col_ptr,
1634 sum_col_stride_x,
1635 sum_col_step_x,
1636 sum_col_stride_y,
1637 sum_col_step_y,
1638 sum_col_offset_first_element_in_bytes
1639#endif // defined(A_OFFSET)
1640#if defined(B_OFFSET)
1641 ,
1642 sum_row_ptr,
1643 sum_row_stride_x,
1644 sum_row_step_x,
1645 sum_row_stride_y,
1646 sum_row_step_y,
1647 sum_row_offset_first_element_in_bytes
1648#endif // defined(B_OFFSET)
1649#if defined(ADD_BIAS)
1650 ,
1651 biases_ptr,
1652 biases_stride_x,
1653 biases_step_x,
1654 biases_offset_first_element_in_bytes
1655#endif // defined(ADD_BIAS)
1656 );
1657
1658 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
1659
1660 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1661
1662 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
1663
1664 // Add the offset terms to GEMM's result
1665 in_s32 += offset_term_s32;
1666
1667 // -------------- OUTPUT STAGE
1668
1669 // Multiply by result_mult_int and shift
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001670#if defined(PER_CHANNEL_QUANTIZATION)
1671 __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
1672 __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
1673 int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
1674 int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr);
1675
Michele Di Giorgio14cbfb22019-10-23 10:53:10 +01001676 int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
1677 int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
1678 in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
1679#else // defined(PER_CHANNEL_QUANTIZATION)
1680
1681#if RESULT_SHIFT < 0
1682 in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
1683#else // RESULT_SHIFT >= 0
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001684 in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
Michele Di Giorgio14cbfb22019-10-23 10:53:10 +01001685#endif // RESULT_SHIFT < 0
1686
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001687#endif // defined(PER_CHANNEL_QUANTIZATION)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001688
1689 // Add the offset terms to GEMM's result
1690 in_s32 += (int4)RESULT_OFFSET;
1691
1692 uchar4 res = convert_uchar4_sat(in_s32);
1693
1694#if defined(MIN_BOUND)
1695 res = max(res, (uchar4)MIN_BOUND);
1696#endif // defined(MIN_BOUND)
1697#if defined(MAX_BOUND)
1698 res = min(res, (uchar4)MAX_BOUND);
1699#endif // defined(MAX_BOUND)
1700
1701 // Store the result
1702 vstore4(res, 0, dst_addr);
1703}
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001704#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
1705
Gian Marco05288a22017-11-21 10:57:50 +00001706#endif // defined(K_OFFSET)
1707
1708#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
1709/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
1710 *
1711 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
1712 * The following computations will be performed by the kernel:
1713 *
1714 * -# Add offset terms to final result
1715 * -# Multiply each entry of result by result_mult_int
1716 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
1717 * -# Shift the int32 accumulator by result_shift
1718 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
1719 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
1720 *
1721 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
1722 *
1723 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
1724 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
1725 * These values can be used to implement "rectified linear unit" activation functions
1726 *
1727 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
1728 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1729 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1730 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1731 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1732 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1733 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1734 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001735 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
1736 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
1737 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1738 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco05288a22017-11-21 10:57:50 +00001739 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
1740 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1741 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1742 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1743 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1744 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1745 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1746 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1747 */
1748__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
1749#if defined(ADD_BIAS)
1750 VECTOR_DECLARATION(biases),
1751#endif // defined(ADD_BIAS)
1752 TENSOR3D_DECLARATION(dst))
1753{
1754 // Compute source and destination addresses
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001755 int x = get_global_id(0) * 4;
1756 int y = get_global_id(1);
1757 int z = get_global_id(2);
Gian Marco05288a22017-11-21 10:57:50 +00001758
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001759 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Gian Marco05288a22017-11-21 10:57:50 +00001760
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001761 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1762
1763 int4 input_values = vload4(0, (__global int *)src_addr);
Gian Marco58c57942017-11-28 09:10:03 +00001764
Gian Marco05288a22017-11-21 10:57:50 +00001765#if defined(ADD_BIAS)
1766 // Add bias
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001767 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
1768
1769 int4 biases_values = vload4(0, (__global int *)bias_addr);
1770 input_values += (int4)biases_values;
Gian Marco05288a22017-11-21 10:57:50 +00001771#endif // defined(ADD_BIAS)
1772
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001773 // Add the offset terms to GEMM's result
1774 input_values += (int4)RESULT_OFFSET;
1775
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +00001776 // Multiply by result_mult_int and shift
Gian Marco58c57942017-11-28 09:10:03 +00001777 input_values *= RESULT_MULT_INT;
Gian Marco05288a22017-11-21 10:57:50 +00001778
Michele Di Giorgio14cbfb22019-10-23 10:53:10 +01001779#if RESULT_SHIFT < 0
1780 input_values >>= -RESULT_SHIFT;
1781#else // RESULT_SHIFT >= 0
Gian Marco58c57942017-11-28 09:10:03 +00001782 input_values >>= RESULT_SHIFT;
Michele Di Giorgio14cbfb22019-10-23 10:53:10 +01001783#endif // RESULT_SHIFT < 0
Gian Marco05288a22017-11-21 10:57:50 +00001784
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001785 uchar4 res = convert_uchar4_sat(input_values);
Gian Marco05288a22017-11-21 10:57:50 +00001786
1787#if defined(MIN_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001788 res = max(res, (uchar4)MIN_BOUND);
Gian Marco05288a22017-11-21 10:57:50 +00001789#endif // defined(MIN_BOUND)
1790#if defined(MAX_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001791 res = min(res, (uchar4)MAX_BOUND);
Gian Marco05288a22017-11-21 10:57:50 +00001792#endif // defined(MAX_BOUND)
1793
1794 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001795 vstore4(res, 0, dst_addr);
Gian Marco05288a22017-11-21 10:57:50 +00001796}
Gian Marco58c57942017-11-28 09:10:03 +00001797#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
1798
1799#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
1800/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
1801 *
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001802 * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8 value.
Gian Marco58c57942017-11-28 09:10:03 +00001803 * The following computations will be performed by the kernel:
1804 *
1805 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
1806 * -# Add bias to final result if bias tensor is not a nullptr
1807 * -# Round to nearest division by a power-of-two using result_shift
1808 * -# Add offset to each result
1809 * -# Clamp the value between the specified min and max bounds
1810 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
1811 *
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001812 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
Gian Marco58c57942017-11-28 09:10:03 +00001813 *
1814 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
1815 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
1816 * These values can be used to implement "rectified linear unit" activation functions
1817 *
1818 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
1819 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1820 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1821 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1822 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1823 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1824 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1825 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001826 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
1827 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
1828 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1829 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco58c57942017-11-28 09:10:03 +00001830 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
1831 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1832 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1833 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1834 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1835 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1836 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1837 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1838 */
1839__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
1840#if defined(ADD_BIAS)
1841 VECTOR_DECLARATION(biases),
1842#endif // defined(ADD_BIAS)
1843 TENSOR3D_DECLARATION(dst))
1844{
1845 // Compute source and destination addresses
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001846 int x = get_global_id(0) * 4;
1847 int y = get_global_id(1);
1848 int z = get_global_id(2);
Georgios Pinitas932491f2018-09-21 16:33:15 +01001849
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001850 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Gian Marco58c57942017-11-28 09:10:03 +00001851
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001852 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1853
1854 int4 input_values = vload4(0, (__global int *)src_addr);
Gian Marco58c57942017-11-28 09:10:03 +00001855
1856#if defined(ADD_BIAS)
1857 // Add bias
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001858 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
1859
1860 int4 biases_values = vload4(0, (__global int *)bias_addr);
1861 input_values += (int4)biases_values;
Gian Marco58c57942017-11-28 09:10:03 +00001862#endif // defined(ADD_BIAS)
1863
1864 // Multiply by result_mult_int and shift
Michele Di Giorgio14cbfb22019-10-23 10:53:10 +01001865#if RESULT_SHIFT < 0
1866 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
1867#else // RESULT_SHIFT >= 0
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001868 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
Michele Di Giorgio14cbfb22019-10-23 10:53:10 +01001869#endif // RESULT_SHIFT < 0
Gian Marco58c57942017-11-28 09:10:03 +00001870
1871 // Add the offset terms to GEMM's result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001872 input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
Gian Marco58c57942017-11-28 09:10:03 +00001873
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001874 uchar4 res = convert_uchar4_sat(input_values);
Gian Marco58c57942017-11-28 09:10:03 +00001875
1876#if defined(MIN_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001877 res = max(res, (uchar4)MIN_BOUND);
Gian Marco58c57942017-11-28 09:10:03 +00001878#endif // defined(MIN_BOUND)
1879#if defined(MAX_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001880 res = min(res, (uchar4)MAX_BOUND);
Gian Marco58c57942017-11-28 09:10:03 +00001881#endif // defined(MAX_BOUND)
1882
1883 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001884 vstore4(res, 0, dst_addr);
Gian Marco58c57942017-11-28 09:10:03 +00001885}
Chunosov5124be52017-11-22 20:42:13 +07001886#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
Georgios Pinitas51e53a32018-10-22 13:49:08 +01001887
Manuel Bottini9c9b70b2019-07-01 17:35:56 +01001888#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
1889
Michalis Spyrou51146c52019-07-12 14:42:29 +01001890/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
Manuel Bottini9c9b70b2019-07-01 17:35:56 +01001891 *
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001892 * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QSYMM16 value.
Manuel Bottini9c9b70b2019-07-01 17:35:56 +01001893 * The following computations will be performed by the kernel:
1894 *
1895 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
1896 * -# Add bias to final result if bias tensor is not a nullptr
1897 * -# Round to nearest division by a power-of-two using result_shift
1898 * -# Add offset to each result
1899 * -# Clamp the value between the specified min and max bounds
1900 * -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16.
1901 *
1902 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
1903 *
1904 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
1905 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
1906 * These values can be used to implement "rectified linear unit" activation functions
1907 *
1908 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
1909 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1910 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1911 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1912 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1913 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1914 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1915 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1916 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
1917 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
1918 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
1919 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
1920 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
1921 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1922 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1923 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1924 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1925 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1926 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1927 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1928 */
1929__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
1930#if defined(ADD_BIAS)
1931 VECTOR_DECLARATION(biases),
1932#endif // defined(ADD_BIAS)
1933 TENSOR3D_DECLARATION(dst))
1934{
1935 // Compute source and destination addresses
1936 int x = get_global_id(0) * 4;
1937 int y = get_global_id(1);
1938 int z = get_global_id(2);
1939
Michalis Spyrou51146c52019-07-12 14:42:29 +01001940 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Manuel Bottini9c9b70b2019-07-01 17:35:56 +01001941
Michalis Spyrou51146c52019-07-12 14:42:29 +01001942 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * 2 + y * dst_stride_y + z * dst_stride_z;
Manuel Bottini9c9b70b2019-07-01 17:35:56 +01001943
1944 int4 input_values = vload4(0, (__global int *)src_addr);
1945
1946#if defined(ADD_BIAS)
1947 // Add bias
Michalis Spyrou51146c52019-07-12 14:42:29 +01001948 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
Manuel Bottini9c9b70b2019-07-01 17:35:56 +01001949
1950 int4 biases_values = vload4(0, (__global int *)bias_addr);
1951 input_values += (int4)biases_values;
1952#endif // defined(ADD_BIAS)
1953
1954 // Multiply by result_mult_int and shift
Manuel Bottini07263982019-10-17 18:37:26 +01001955#if RESULT_SHIFT < 0
Michele Di Giorgio14cbfb22019-10-23 10:53:10 +01001956 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001957#else // RESULT_SHIFT >= 0
Manuel Bottini9c9b70b2019-07-01 17:35:56 +01001958 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
Manuel Bottini07263982019-10-17 18:37:26 +01001959#endif // RESULT_SHIFT < 0
Manuel Bottini9c9b70b2019-07-01 17:35:56 +01001960
1961 short4 res = convert_short4_sat(input_values);
1962
1963#if defined(MIN_BOUND)
1964 res = max(res, (short4)MIN_BOUND);
1965#endif // defined(MIN_BOUND)
1966#if defined(MAX_BOUND)
1967 res = min(res, (short4)MAX_BOUND);
1968#endif // defined(MAX_BOUND)
1969
1970 // Store the result
Michalis Spyrou51146c52019-07-12 14:42:29 +01001971 vstore4(res, 0, (__global short *)dst_addr);
Manuel Bottini9c9b70b2019-07-01 17:35:56 +01001972}
1973#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
1974
Georgios Pinitas51e53a32018-10-22 13:49:08 +01001975#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
1976/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
1977 *
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +00001978 * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8 value.
Georgios Pinitas51e53a32018-10-22 13:49:08 +01001979 * The following computations will be performed by the kernel:
1980 *
1981 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
1982 * -# Add bias to final result if bias tensor is not a nullptr
1983 * -# Requantize
1984 * -# Add offset to each result
1985 * -# Clamp the value between the specified min and max bounds
1986 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
1987 *
1988 * @attention The offset and scalar scale factor must be passed at compile time using -DRESULT_OFFSET, -DREAL_MULTIPLIER
1989 *
1990 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
1991 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
1992 * These values can be used to implement "rectified linear unit" activation functions
1993 *
1994 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
1995 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1996 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1997 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1998 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1999 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2000 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2001 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2002 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
2003 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
2004 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
2005 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
2006 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2007 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2008 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2009 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2010 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2011 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2012 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2013 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2014 * @param[in] dst_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
2015 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2016 */
2017__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
2018#if defined(ADD_BIAS)
2019 VECTOR_DECLARATION(biases),
2020#endif // defined(ADD_BIAS)
2021#if defined(DST_HEIGHT)
2022 TENSOR4D_DECLARATION(dst))
2023#else // defined(DST_HEIGHT)
2024 TENSOR3D_DECLARATION(dst))
2025#endif // defined(DST_HEIGHT)
2026{
2027 // Compute source and destination addresses
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002028 int x = get_global_id(0) * 4;
2029 int y = get_global_id(1);
2030 int z = get_global_id(2);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002031
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002032 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002033
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002034 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2035
2036 int4 input_values = vload4(0, (__global int *)src_addr);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002037
2038#if defined(ADD_BIAS)
2039 // Add bias
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002040 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2041
2042 int4 biases_values = vload4(0, (__global int *)bias_addr);
2043 input_values += (int4)biases_values;
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002044#endif // defined(ADD_BIAS)
2045
2046 // Convert to float
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002047 float16 input_values_f = convert_float4(input_values);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002048 input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
2049
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002050 uchar4 res = convert_uchar4_sat(input_values_f);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002051
2052#if defined(MIN_BOUND)
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002053 res = max(res, (uchar4)MIN_BOUND);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002054#endif // defined(MIN_BOUND)
2055#if defined(MAX_BOUND)
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002056 res = min(res, (uchar4)MAX_BOUND);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002057#endif // defined(MAX_BOUND)
2058
2059 // Store the result
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002060 vstore4(res, 0, dst_addr);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002061}
Gian Marco Iodicedb18a6f2019-05-30 09:53:10 +01002062#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)