blob: 00803697059a3a465192a8796d93482a23f0916f [file] [log] [blame]
Gian Marco05288a22017-11-21 10:57:50 +00001/*
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00002 * Copyright (c) 2017-2019 ARM Limited.
Gian Marco05288a22017-11-21 10:57:50 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010024#include "gemm_helpers.h"
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +000025#include "helpers_asymm.h"
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +000026#include "repeat.h"
Gian Marco05288a22017-11-21 10:57:50 +000027
Georgios Pinitasdaa38552018-08-28 17:43:18 +010028#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
29#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010030#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010031#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010032#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010033#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
34#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenac50da382018-07-26 15:50:09 +010035
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010036#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
37
38/** Specialized macros to perform the dot product instruction between two vectors of size N [1,16]. These macros use the dot8 instruction */
39#define ARM_DOT1(a, b, c) \
40 ({ \
41 ARM_DOT((uchar4)(a, (uchar3)0), (uchar4)(b, (uchar3)0), c); \
42 })
43#define ARM_DOT2(a, b, c) \
44 ({ \
45 ARM_DOT((uchar4)(a, (uchar2)0), (uchar4)(b, (uchar2)0), c); \
46 })
47#define ARM_DOT3(a, b, c) \
48 ({ \
49 ARM_DOT((uchar4)(a, (uchar)0), (uchar4)(b, (uchar)0), c); \
50 })
51#define ARM_DOT4(a, b, c) \
52 ({ \
53 ARM_DOT(a, b, c); \
54 })
55#define ARM_DOT8(a, b, c) \
56 ({ \
57 ARM_DOT4((a.lo), (b.lo), c); \
58 ARM_DOT4((a.hi), (b.hi), c); \
59 })
60#define ARM_DOT16(a, b, c) \
61 ({ \
62 ARM_DOT8((a.lo), (b.lo), c); \
63 ARM_DOT8((a.hi), (b.hi), c); \
64 })
65
66#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
67
68/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] without using the dot8 instruction. */
69#define ARM_DOT1(a, b, c) \
70 ({ \
71 c += (uint)a.s0 * b.s0; \
72 })
73#define ARM_DOT2(a, b, c) \
74 ({ \
75 ARM_DOT1(a, b, c); \
76 c += (uint)a.s1 * b.s1; \
77 })
78#define ARM_DOT3(a, b, c) \
79 ({ \
80 ARM_DOT2(a, b, c); \
81 c += (uint)a.s2 * b.s2; \
82 })
83#define ARM_DOT4(a, b, c) \
84 ({ \
85 ARM_DOT3(a, b, c); \
86 c += (uint)a.s3 * b.s3; \
87 })
88#define ARM_DOT8(a, b, c) \
89 ({ \
90 ARM_DOT4((a.lo), (b.lo), c); \
91 ARM_DOT4((a.hi), (b.hi), c); \
92 })
93#define ARM_DOT16(a, b, c) \
94 ({ \
95 ARM_DOT8((a.lo), (b.lo), c); \
96 ARM_DOT8((a.hi), (b.hi), c); \
97 })
98#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
99
100/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
101#define ARM_DOT_K0X2(k0, a, b, c) \
102 ({ \
103 ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
104 ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
105 })
106#define ARM_DOT_K0X3(k0, a, b, c) \
107 ({ \
108 ARM_DOT_K0X2(k0, a, b, c); \
109 ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
110 })
111#define ARM_DOT_K0X4(k0, a, b, c) \
112 ({ \
113 ARM_DOT_K0X3(k0, a, b, c); \
114 ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
115 })
116#define ARM_DOT_K0X8(k0, a, b, c) \
117 ({ \
118 ARM_DOT_K0X4(k0, a, b, c); \
119 ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
120 ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
121 ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
122 ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
123 })
124#define ARM_DOT_K0X16(k0, a, b, c) \
125 ({ \
126 ARM_DOT_K0X8(k0, a, b, c); \
127 ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
128 ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
129 ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
130 ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
131 ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
132 ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
133 ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
134 ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
135 })
136
137/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0*/
138#define ARM_MM_K0XN0X1(n0, k0, a, b, c) \
139 ({ \
140 ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
141 })
142#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
143 ({ \
144 ARM_MM_K0XN0X1(n0, k0, a, b, c); \
145 ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
146 })
147#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
148 ({ \
149 ARM_MM_K0XN0X2(n0, k0, a, b, c); \
150 ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
151 })
152#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
153 ({ \
154 ARM_MM_K0XN0X3(n0, k0, a, b, c); \
155 ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
156 })
157#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
158 ({ \
159 ARM_MM_K0XN0X4(n0, k0, a, b, c); \
160 ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
161 })
162#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
163 ({ \
164 ARM_MM_K0XN0X5(n0, k0, a, b, c); \
165 ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
166 })
167#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
168 ({ \
169 ARM_MM_K0XN0X6(n0, k0, a, b, c); \
170 ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
171 })
172#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
173 ({ \
174 ARM_MM_K0XN0X7(n0, k0, a, b, c); \
175 ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
176 })
177
178#define ARM_DOT_K0(k0, a, b, c) \
179 ({ \
180 CONCAT(ARM_DOT, k0) \
181 ((a), (b), (c)); \
182 })
183
184#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
185 ({ \
186 CONCAT(ARM_DOT_K0X, n0) \
187 (k0, (a), b, (c)); \
188 })
189
190#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
191 ({ \
192 CONCAT(ARM_MM_K0XN0X, m0) \
193 (n0, k0, a, b, c); \
194 })
195
Gian Marco19835e52018-01-30 13:35:54 +0000196#if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +0000197/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco Iodice5fc07aa2019-05-15 17:08:02 +0100198 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel before running the matrix multiplication
Gian Marco05288a22017-11-21 10:57:50 +0000199 *
Gian Marco19835e52018-01-30 13:35:54 +0000200 * @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024
201 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
202 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
Gian Marco05288a22017-11-21 10:57:50 +0000203 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100204 * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
205 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
206 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
207 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
208 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
209 *
Gian Marco05288a22017-11-21 10:57:50 +0000210 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
211 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
212 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
213 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
214 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
215 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
216 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
217 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
218 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
219 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
220 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
221 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
222 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
223 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
224 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
225 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
226 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
227 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100228 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
229 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
230 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
231 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +0000232 */
Gian Marco19835e52018-01-30 13:35:54 +0000233__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
234 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100235 IMAGE_DECLARATION(dst),
236 uint src0_stride_z,
237 uint src1_stride_z,
238 uint dst_stride_z
239#if defined(REINTERPRET_OUTPUT_AS_3D)
240 ,
241 uint cross_plane_pad
242#endif // REINTERPRET_OUTPUT_AS_3D
243 )
Gian Marco05288a22017-11-21 10:57:50 +0000244{
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100245 const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
246 const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
247 const int z = get_global_id(2);
Gian Marco05288a22017-11-21 10:57:50 +0000248
Gian Marco19835e52018-01-30 13:35:54 +0000249 // Offset
250 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
251 const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
252
253 // src_addr_a = address of matrix A
254 // src_addr_b = address of matrix B
Isabella Gottardib92805b2018-09-28 18:24:27 +0100255 __global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
Gian Marco19835e52018-01-30 13:35:54 +0000256 __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
Gian Marco05288a22017-11-21 10:57:50 +0000257
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100258#if defined(MATRIX_B_DEPTH)
259 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
260 src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;
261#else // defined(MATRIX_B_DEPTH)
262 src_addr_b += z * src1_stride_z;
263#endif // defined(MATRIX_B_DEPTH)
264
Gian Marco05288a22017-11-21 10:57:50 +0000265 // Compute end row address for matrix B
Gian Marco19835e52018-01-30 13:35:54 +0000266 __global uchar *src_end_addr_b = src_addr_b + COLS_B;
267
268 src_addr_a += offset_row_a;
269 src_addr_b += offset_row_b;
Gian Marco05288a22017-11-21 10:57:50 +0000270
271 // Reset accumulators
Gian Marco19835e52018-01-30 13:35:54 +0000272 int4 c00 = 0;
273 int4 c10 = 0;
274 int4 c20 = 0;
275 int4 c30 = 0;
Gian Marco05288a22017-11-21 10:57:50 +0000276
Gian Marco19835e52018-01-30 13:35:54 +0000277 for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +0000278 {
279 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco19835e52018-01-30 13:35:54 +0000280 int4 a0 = convert_int4(vload4(0, src_addr_a));
281 int4 b0 = convert_int4(vload4(0, src_addr_b));
Gian Marco05288a22017-11-21 10:57:50 +0000282
Gian Marco19835e52018-01-30 13:35:54 +0000283 c00 += (int4)a0.s0 * b0;
284 c10 += (int4)a0.s1 * b0;
285 c20 += (int4)a0.s2 * b0;
286 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000287
Gian Marco19835e52018-01-30 13:35:54 +0000288 a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
289 b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));
Gian Marco05288a22017-11-21 10:57:50 +0000290
Gian Marco19835e52018-01-30 13:35:54 +0000291 c00 += (int4)a0.s0 * b0;
292 c10 += (int4)a0.s1 * b0;
293 c20 += (int4)a0.s2 * b0;
294 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000295 }
296
Gian Marco19835e52018-01-30 13:35:54 +0000297 for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
Gian Marco05288a22017-11-21 10:57:50 +0000298 {
299 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco19835e52018-01-30 13:35:54 +0000300 int4 a0 = convert_int4(vload4(0, src_addr_a));
301 int4 b0 = convert_int4(vload4(0, src_addr_b));
Gian Marco05288a22017-11-21 10:57:50 +0000302
Gian Marco19835e52018-01-30 13:35:54 +0000303 c00 += (int4)a0.s0 * b0;
304 c10 += (int4)a0.s1 * b0;
305 c20 += (int4)a0.s2 * b0;
306 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000307 }
308
309 // Compute destination address
310 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
311
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100312#if defined(REINTERPRET_OUTPUT_AS_3D)
313 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
314 // in order to take into account the presence of possible cross plane paddings
315 //
316 // | |
317 // | plane0 |
318 // | |
319 // |__________________|
320 // |******************|
321 // | cross_plane_pad |
322 // |******************|
323 // | |
324 // | plane1 |
325 // | |
326 // |__________________|
327
328 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
329 uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
330 zout = min(DEPTH_GEMM3D - 1, zout);
331
332 // Add offset due to the cross plane paddings
333 zout *= (cross_plane_pad * dst_stride_y);
334
335 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
336 // multiply dst_stride_z by DEPTH_GEMM3D
337 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
338
Gian Marco19835e52018-01-30 13:35:54 +0000339 // Store 4x4 block
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100340 vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
341 vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
342 vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
343 vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
344
345#else // defined(REINTERPRET_OUTPUT_AS_3D)
346 // Add offset for batched GEMM
347 dst.ptr += z * dst_stride_z;
348
349 // Store 4x4 block
350 vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
351 vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
352 vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
353 vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
354#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +0000355}
Gian Marco19835e52018-01-30 13:35:54 +0000356#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +0000357
358#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
359#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
360#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
361#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
362/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
363 *
364 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
365 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100366 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
367 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
368 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
369 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
370 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
371 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
372 *
Gian Marco05288a22017-11-21 10:57:50 +0000373 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
374 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
375 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
376 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
377 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
378 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
379 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
380 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
381 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
382 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
383 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
384 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
385 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
386 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
387 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
388 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
389 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
390 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100391 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
392 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
393 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
394 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
395 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +0000396 */
Gian Marco7b4d5472018-01-10 15:56:30 +0000397__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
398 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100399 IMAGE_DECLARATION(dst),
400 uint src0_stride_z,
401 uint src1_stride_z,
402 uint dst_stride_z
403#if defined(REINTERPRET_INPUT_AS_3D)
404 ,
405 uint src_cross_plane_pad
406#endif // REINTERPRET_INPUT_AS_3D
407#if defined(REINTERPRET_OUTPUT_AS_3D)
408 ,
409 uint dst_cross_plane_pad
410#endif // REINTERPRET_OUTPUT_AS_3D
411 )
Gian Marco05288a22017-11-21 10:57:50 +0000412{
413 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
414
415 // Compute starting address for matrix A and Matrix B
416 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
417
418 // Update address for the matrix A
419 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
420
421 // Update address for the matrix B
422 src_addr.s1 += idx;
423
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100424#if defined(REINTERPRET_INPUT_AS_3D)
425 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
426 // in order to take into account the presence of possible cross plane paddings
427 //
428 // | |
429 // | plane0 |
430 // | |
431 // |__________________|
432 // |******************|
433 // | cross_plane_pad |
434 // |******************|
435 // | |
436 // | plane1 |
437 // | |
438 // |__________________|
439
440 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
441 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
442 zin = min(DEPTH_GEMM3D - 1, zin);
443
444 // Add offset due to the cross plane paddings
445 zin *= (src_cross_plane_pad * src0_stride_y);
446
447 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
448 // multiply src0_stride_z by DEPTH_GEMM3D
449 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
450
451#else // defined(REINTERPRET_INPUT_AS_3D)
452
453 // Add offset for batched GEMM
454 src_addr.s0 += get_global_id(2) * src0_stride_z;
455
456#endif // defined(REINTERPRET_INPUT_AS_3D)
457
458#if defined(MATRIX_B_DEPTH)
459 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
460 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
461#else // defined(MATRIX_B_DEPTH)
462 src_addr.s1 += get_global_id(2) * src1_stride_z;
463#endif // defined(MATRIX_B_DEPTH)
464
Gian Marco05288a22017-11-21 10:57:50 +0000465 int end_row_vec_a = src_addr.s0 + COLS_A;
466
467 VECTOR_UINT acc0 = 0;
468#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
469 VECTOR_UINT acc1 = 0;
470#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
471#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
472 VECTOR_UINT acc2 = 0;
473#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
474#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
475 VECTOR_UINT acc3 = 0;
476#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000477#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
478 VECTOR_UINT acc4 = 0;
479#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000480
481 for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
482 {
483 // Load values from matrix A
484 uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
485#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
486 uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
487#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
488#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
489 uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
490#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
491#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
492 uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
493#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000494#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
495 uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
496#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000497 // Load values from matrix B
498 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
499 VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);
500
501 // Accumulate
502 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
503 acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
504#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
505 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
506 acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
507#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
508#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
509 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
510 acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
511#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
512#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
513 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
514 acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
515#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000516#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
517 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
518 acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
519#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000520 }
521
522 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
523 {
524 // Load values from matrix A
525 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
526#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
527 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
528#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
529#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
530 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
531#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
532#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
533 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
534#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000535#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
536 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
537#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000538 // Load values from matrix B
539 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
540
541 // Accumulate
542 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
543#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
544 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
545#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
546#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
547 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
548#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
549#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
550 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
551#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000552#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
553 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
554#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000555 }
556
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100557 const int z = get_global_id(2);
558
Gian Marco05288a22017-11-21 10:57:50 +0000559 // Compute destination address
560 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
561
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100562#if defined(REINTERPRET_OUTPUT_AS_3D)
563 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
564 // in order to take into account the presence of possible cross plane paddings
565 //
566 // | |
567 // | plane0 |
568 // | |
569 // |__________________|
570 // |******************|
571 // | cross_plane_pad |
572 // |******************|
573 // | |
574 // | plane1 |
575 // | |
576 // |__________________|
577
578 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
579 uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
580 zout = min(DEPTH_GEMM3D - 1, zout);
581
582 // Add offset due to the cross plane paddings
583 zout *= (dst_cross_plane_pad * dst_stride_y);
584
585 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
586 // multiply dst_stride_z by DEPTH_GEMM3D
587 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
588
Gian Marco05288a22017-11-21 10:57:50 +0000589 // Store the result
590 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100591 (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
Gian Marco05288a22017-11-21 10:57:50 +0000592#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
593 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100594 (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
Gian Marco05288a22017-11-21 10:57:50 +0000595#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
596#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
597 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100598 (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
Gian Marco05288a22017-11-21 10:57:50 +0000599#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
600#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
601 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100602 (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
Gian Marco05288a22017-11-21 10:57:50 +0000603#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000604#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
605 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100606 (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
Gian Marco7b4d5472018-01-10 15:56:30 +0000607#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100608
609#else // defined(REINTERPRET_OUTPUT_AS_3D)
610 // Add offset for batched GEMM
611 dst.ptr += z * dst_stride_z;
612
613 // Store the result
614 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
615 (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
616#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
617 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
618 (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
619#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
620#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
621 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
622 (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
623#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
624#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
625 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
626 (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
627#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
628#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
629 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
630 (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
631#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
632#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco7b4d5472018-01-10 15:56:30 +0000633}
634
635/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
636 *
637 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
638 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100639 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
640 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
641 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
642 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
643 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
644 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
645 *
Gian Marco7b4d5472018-01-10 15:56:30 +0000646 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
647 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
648 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
649 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
650 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
651 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
652 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
653 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
654 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
655 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
656 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
657 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
658 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
659 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
660 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
661 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
662 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
663 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100664 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
665 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
666 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
667 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
668 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco7b4d5472018-01-10 15:56:30 +0000669 */
670__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
671 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100672 IMAGE_DECLARATION(dst),
673 uint src0_stride_z,
674 uint src1_stride_z,
675 uint dst_stride_z
676#if defined(REINTERPRET_INPUT_AS_3D)
677 ,
678 uint src_cross_plane_pad
679#endif // REINTERPRET_INPUT_AS_3D
680#if defined(REINTERPRET_OUTPUT_AS_3D)
681 ,
682 uint dst_cross_plane_pad
683#endif // REINTERPRET_OUTPUT_AS_3D
684 )
Gian Marco7b4d5472018-01-10 15:56:30 +0000685{
686 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
687
688 // Compute starting address for matrix A and Matrix B
689 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
690
691 // Update address for the matrix A
692 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
693
694 // Update address for the matrix B
695 src_addr.s1 += idx;
696
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100697#if defined(REINTERPRET_INPUT_AS_3D)
698 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
699 // in order to take into account the presence of possible cross plane paddings
700 //
701 // | |
702 // | plane0 |
703 // | |
704 // |__________________|
705 // |******************|
706 // | cross_plane_pad |
707 // |******************|
708 // | |
709 // | plane1 |
710 // | |
711 // |__________________|
712
713 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
714 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
715 zin = min(DEPTH_GEMM3D - 1, zin);
716
717 // Add offset due to the cross plane paddings
718 zin *= (src_cross_plane_pad * src0_stride_y);
719
720 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
721 // multiply src0_stride_z by DEPTH_GEMM3D
722 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
723
724#else // defined(REINTERPRET_INPUT_AS_3D)
725
726 // Add offset for batched GEMM
727 src_addr.s0 += get_global_id(2) * src0_stride_z;
728
729#endif // defined(REINTERPRET_INPUT_AS_3D)
730
731#if defined(MATRIX_B_DEPTH)
732 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
733 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
734#else // defined(MATRIX_B_DEPTH)
735 src_addr.s1 += get_global_id(2) * src1_stride_z;
736#endif // defined(MATRIX_B_DEPTH)
737
Gian Marco7b4d5472018-01-10 15:56:30 +0000738 int end_row_vec_a = src_addr.s0 + COLS_A;
739
740 uint acc00 = 0;
741 uint acc01 = 0;
742 uint acc02 = 0;
743 uint acc03 = 0;
744#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
745 uint acc10 = 0;
746 uint acc11 = 0;
747 uint acc12 = 0;
748 uint acc13 = 0;
749#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
750#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
751 uint acc20 = 0;
752 uint acc21 = 0;
753 uint acc22 = 0;
754 uint acc23 = 0;
755#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
756#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
757 uint acc30 = 0;
758 uint acc31 = 0;
759 uint acc32 = 0;
760 uint acc33 = 0;
761#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
762#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
763 uint acc40 = 0;
764 uint acc41 = 0;
765 uint acc42 = 0;
766 uint acc43 = 0;
767#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
768
769 for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
770 {
771 // Load values from matrix A
772 uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
773#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
774 uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
775#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
776#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
777 uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
778#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
779#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
780 uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
781#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
782#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
783 uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
784#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
785 // Load values from matrix B
786 uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
787 uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
788 uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
789 uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
790
791 {
792 // Accumulate
793 ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
794 ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
795 ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
796 ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;
797
798 ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
799 ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
800 ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
801 ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;
802
803 ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
804 ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
805 ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
806 ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;
807
808 ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
809 ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
810 ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
811 ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;
812
813 acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
814 acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
815 acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
816 acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
817 }
818#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
819 {
820 // Accumulate
821 ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
822 ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
823 ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
824 ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;
825
826 ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
827 ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
828 ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
829 ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;
830
831 ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
832 ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
833 ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
834 ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;
835
836 ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
837 ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
838 ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
839 ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;
840
841 acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
842 acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
843 acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
844 acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
845 }
846#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
847#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
848 {
849 // Accumulate
850 ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
851 ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
852 ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
853 ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;
854
855 ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
856 ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
857 ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
858 ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;
859
860 ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
861 ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
862 ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
863 ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;
864
865 ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
866 ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
867 ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
868 ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;
869
870 acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
871 acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
872 acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
873 acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
874 }
875#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
876#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
877 {
878 // Accumulate
879 ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
880 ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
881 ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
882 ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;
883
884 ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
885 ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
886 ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
887 ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;
888
889 ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
890 ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
891 ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
892 ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;
893
894 ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
895 ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
896 ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
897 ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;
898
899 acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
900 acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
901 acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
902 acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
903 }
904#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
905#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
906 {
907 // Accumulate
908 ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
909 ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
910 ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
911 ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;
912
913 ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
914 ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
915 ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
916 ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;
917
918 ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
919 ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
920 ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
921 ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;
922
923 ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
924 ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
925 ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
926 ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;
927
928 acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
929 acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
930 acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
931 acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
932 }
933#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
934 }
935
936 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
937 {
938 // Load values from matrix A
939 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
940#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
941 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
942#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
943#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
944 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
945#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
946#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
947 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
948#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
949#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
950 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
951#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
952 // Load values from matrix B
953 uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);
954
955 // Accumulate
956 {
957 // Accumulate
958 ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
959 ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
960 ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
961 ushort tmp3 = (ushort)b0.s3 * (ushort)a0;
962
963 acc00 += ((uint)tmp0);
964 acc01 += ((uint)tmp1);
965 acc02 += ((uint)tmp2);
966 acc03 += ((uint)tmp3);
967 }
968#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
969 {
970 // Accumulate
971 ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
972 ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
973 ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
974 ushort tmp3 = (ushort)b0.s3 * (ushort)a1;
975
976 acc10 += ((uint)tmp0);
977 acc11 += ((uint)tmp1);
978 acc12 += ((uint)tmp2);
979 acc13 += ((uint)tmp3);
980 }
981#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
982#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
983 {
984 // Accumulate
985 ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
986 ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
987 ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
988 ushort tmp3 = (ushort)b0.s3 * (ushort)a2;
989
990 acc20 += ((uint)tmp0);
991 acc21 += ((uint)tmp1);
992 acc22 += ((uint)tmp2);
993 acc23 += ((uint)tmp3);
994 }
995#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
996#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
997 {
998 // Accumulate
999 ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
1000 ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
1001 ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
1002 ushort tmp3 = (ushort)b0.s3 * (ushort)a3;
1003
1004 acc30 += ((uint)tmp0);
1005 acc31 += ((uint)tmp1);
1006 acc32 += ((uint)tmp2);
1007 acc33 += ((uint)tmp3);
1008 }
1009#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1010#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1011 {
1012 // Accumulate
1013 ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
1014 ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
1015 ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
1016 ushort tmp3 = (ushort)b0.s3 * (ushort)a4;
1017
1018 acc40 += ((uint)tmp0);
1019 acc41 += ((uint)tmp1);
1020 acc42 += ((uint)tmp2);
1021 acc43 += ((uint)tmp3);
1022 }
1023#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1024 }
1025
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001026 const int z = get_global_id(2);
1027
Gian Marco7b4d5472018-01-10 15:56:30 +00001028 // Compute destination address
1029 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1030
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001031#if defined(REINTERPRET_OUTPUT_AS_3D)
1032 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
1033 // in order to take into account the presence of possible cross plane paddings
1034 //
1035 // | |
1036 // | plane0 |
1037 // | |
1038 // |__________________|
1039 // |******************|
1040 // | cross_plane_pad |
1041 // |******************|
1042 // | |
1043 // | plane1 |
1044 // | |
1045 // |__________________|
1046
1047 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
1048 uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
1049 zout = min(DEPTH_GEMM3D - 1, zout);
1050
1051 // Add offset due to the cross plane paddings
1052 zout *= (dst_cross_plane_pad * dst_stride_y);
1053
1054 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1055 // multiply dst_stride_z by DEPTH_GEMM3D
1056 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
1057
Gian Marco7b4d5472018-01-10 15:56:30 +00001058 // Store the result
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001059 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
Gian Marco7b4d5472018-01-10 15:56:30 +00001060#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001061 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
Gian Marco7b4d5472018-01-10 15:56:30 +00001062#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1063#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001064 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
Gian Marco7b4d5472018-01-10 15:56:30 +00001065#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1066#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001067 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
Gian Marco7b4d5472018-01-10 15:56:30 +00001068#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1069#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001070 vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
Gian Marco7b4d5472018-01-10 15:56:30 +00001071#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001072
1073#else // defined(REINTERPRET_OUTPUT_AS_3D)
1074 // Add offset for batched GEMM
1075 dst.ptr += z * dst_stride_z;
1076
1077 // Store the result
1078 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
1079#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1080 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
1081#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1082#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1083 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
1084#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1085#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1086 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
1087#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1088#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1089 vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
1090#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1091#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +00001092}
Giorgio Arena6200fa42018-07-06 17:06:36 +01001093
Georgios Pinitasdaa38552018-08-28 17:43:18 +01001094#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001095/** OpenCL kernel optimized to use dot product that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
1096 *
1097 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
1098 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001099 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1100 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1101 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1102 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1103 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1104 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
1105 *
Giorgio Arena6200fa42018-07-06 17:06:36 +01001106 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
1107 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1108 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1109 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1110 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1111 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1112 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
1113 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1114 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1115 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1116 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1117 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1118 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
1119 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1120 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1121 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1122 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1123 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001124 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
1125 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
1126 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1127 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
1128 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001129 */
1130__kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),
1131 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001132 IMAGE_DECLARATION(dst),
1133 uint src0_stride_z,
1134 uint src1_stride_z,
1135 uint dst_stride_z
1136#if defined(REINTERPRET_INPUT_AS_3D)
1137 ,
1138 uint src_cross_plane_pad
1139#endif // REINTERPRET_INPUT_AS_3D
1140#if defined(REINTERPRET_OUTPUT_AS_3D)
1141 ,
1142 uint dst_cross_plane_pad
1143#endif // REINTERPRET_OUTPUT_AS_3D)
1144 )
Giorgio Arena6200fa42018-07-06 17:06:36 +01001145{
1146 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
1147
1148 // Compute starting address for matrix A and Matrix B
1149 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1150
1151 // Update address for the matrix A
1152 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
1153
1154 // Update address for the matrix B
1155 src_addr.s1 += idx;
1156
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001157#if defined(REINTERPRET_INPUT_AS_3D)
1158 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
1159 // in order to take into account the presence of possible cross plane paddings
1160 //
1161 // | |
1162 // | plane0 |
1163 // | |
1164 // |__________________|
1165 // |******************|
1166 // | cross_plane_pad |
1167 // |******************|
1168 // | |
1169 // | plane1 |
1170 // | |
1171 // |__________________|
1172
1173 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
1174 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
1175 zin = min(DEPTH_GEMM3D - 1, zin);
1176
1177 // Add offset due to the cross plane paddings
1178 zin *= (src_cross_plane_pad * src0_stride_y);
1179
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001180 zin += ((uint4)(0, 1, 2, 3)) * src0_stride_y;
1181
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001182 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1183 // multiply src0_stride_z by DEPTH_GEMM3D
1184 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
1185
1186#else // defined(REINTERPRET_INPUT_AS_3D)
1187
1188 // Add offset for batched GEMM
1189 src_addr.s0 += get_global_id(2) * src0_stride_z;
1190
1191#endif // defined(REINTERPRET_INPUT_AS_3D)
1192
1193#if defined(MATRIX_B_DEPTH)
1194 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1195 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
1196#else // defined(MATRIX_B_DEPTH)
1197 src_addr.s1 += get_global_id(2) * src1_stride_z;
1198#endif // defined(MATRIX_B_DEPTH)
1199
Giorgio Arena6200fa42018-07-06 17:06:36 +01001200 uint acc00 = 0;
1201 uint acc01 = 0;
1202 uint acc02 = 0;
1203 uint acc03 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001204 uint acc04 = 0;
1205 uint acc05 = 0;
1206 uint acc06 = 0;
1207 uint acc07 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001208#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1209 uint acc10 = 0;
1210 uint acc11 = 0;
1211 uint acc12 = 0;
1212 uint acc13 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001213 uint acc14 = 0;
1214 uint acc15 = 0;
1215 uint acc16 = 0;
1216 uint acc17 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001217#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1218#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1219 uint acc20 = 0;
1220 uint acc21 = 0;
1221 uint acc22 = 0;
1222 uint acc23 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001223 uint acc24 = 0;
1224 uint acc25 = 0;
1225 uint acc26 = 0;
1226 uint acc27 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001227#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1228#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1229 uint acc30 = 0;
1230 uint acc31 = 0;
1231 uint acc32 = 0;
1232 uint acc33 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001233 uint acc34 = 0;
1234 uint acc35 = 0;
1235 uint acc36 = 0;
1236 uint acc37 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001237#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Giorgio Arena6200fa42018-07-06 17:06:36 +01001238
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001239 // A and B src indices get incremented at the same time.
1240 int i = 0;
1241 for(; i <= ((int)COLS_A - 8); i += 8)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001242 {
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001243#if defined(REINTERPRET_INPUT_AS_3D)
1244 // Load values from matrix A and matrix B
1245 uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001246#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001247 uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001248#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1249#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001250 uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001251#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1252#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001253 uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001254#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001255#else // defined(REINTERPRET_INPUT_AS_3D)
1256 // Load values from matrix A and matrix B
1257 uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1258#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1259 uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1260#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1261#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1262 uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1263#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1264#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1265 uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1266#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1267#endif // defined(REINTERPRET_INPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001268
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001269 uchar8 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
1270 uchar8 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
1271 uchar8 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
1272 uchar8 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
1273 src_addr.s1 += 4 * src1_stride_y;
1274
1275 ARM_DOT(a0.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);
1276 ARM_DOT(a0.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);
1277 ARM_DOT(a0.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);
1278 ARM_DOT(a0.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);
1279 ARM_DOT(a0.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);
1280 ARM_DOT(a0.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);
1281 ARM_DOT(a0.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);
1282 ARM_DOT(a0.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);
1283
Giorgio Arena6200fa42018-07-06 17:06:36 +01001284#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001285 ARM_DOT(a1.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);
1286 ARM_DOT(a1.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);
1287 ARM_DOT(a1.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);
1288 ARM_DOT(a1.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);
1289 ARM_DOT(a1.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);
1290 ARM_DOT(a1.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);
1291 ARM_DOT(a1.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);
1292 ARM_DOT(a1.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);
Giorgio Arena6200fa42018-07-06 17:06:36 +01001293#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1294#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001295 ARM_DOT(a2.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);
1296 ARM_DOT(a2.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);
1297 ARM_DOT(a2.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);
1298 ARM_DOT(a2.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);
1299 ARM_DOT(a2.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);
1300 ARM_DOT(a2.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);
1301 ARM_DOT(a2.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);
1302 ARM_DOT(a2.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);
Giorgio Arena6200fa42018-07-06 17:06:36 +01001303#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1304#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001305 ARM_DOT(a3.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);
1306 ARM_DOT(a3.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);
1307 ARM_DOT(a3.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);
1308 ARM_DOT(a3.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);
1309 ARM_DOT(a3.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);
1310 ARM_DOT(a3.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);
1311 ARM_DOT(a3.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);
1312 ARM_DOT(a3.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);
Giorgio Arena6200fa42018-07-06 17:06:36 +01001313#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001314
1315 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
1316 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
1317 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
1318 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
1319 src_addr.s1 += 4 * src1_stride_y;
1320
1321 ARM_DOT(a0.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);
1322 ARM_DOT(a0.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);
1323 ARM_DOT(a0.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);
1324 ARM_DOT(a0.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);
1325 ARM_DOT(a0.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);
1326 ARM_DOT(a0.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);
1327 ARM_DOT(a0.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);
1328 ARM_DOT(a0.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);
1329
1330#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1331 ARM_DOT(a1.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);
1332 ARM_DOT(a1.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);
1333 ARM_DOT(a1.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);
1334 ARM_DOT(a1.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);
1335 ARM_DOT(a1.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);
1336 ARM_DOT(a1.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);
1337 ARM_DOT(a1.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);
1338 ARM_DOT(a1.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);
1339#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1340#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1341 ARM_DOT(a2.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);
1342 ARM_DOT(a2.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);
1343 ARM_DOT(a2.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);
1344 ARM_DOT(a2.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);
1345 ARM_DOT(a2.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);
1346 ARM_DOT(a2.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);
1347 ARM_DOT(a2.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);
1348 ARM_DOT(a2.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);
1349#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1350#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1351 ARM_DOT(a3.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);
1352 ARM_DOT(a3.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);
1353 ARM_DOT(a3.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);
1354 ARM_DOT(a3.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);
1355 ARM_DOT(a3.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);
1356 ARM_DOT(a3.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);
1357 ARM_DOT(a3.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);
1358 ARM_DOT(a3.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);
1359#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1360
1361 src_addr.s0 += 8;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001362 }
1363
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001364 for(; i < (int)COLS_A; ++i)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001365 {
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001366#if defined(REINTERPRET_INPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001367 // Load values from matrix A
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001368 uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001369#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001370 uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001371#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1372#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001373 uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001374#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1375#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001376 uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001377#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001378#else // defined(REINTERPRET_INPUT_AS_3D)
1379 // Load values from matrix A
1380 uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1381#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1382 uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1383#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1384#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1385 uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1386#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1387#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1388 uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1389#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1390#endif // defined(REINTERPRET_INPUT_AS_3D)
1391
Giorgio Arena6200fa42018-07-06 17:06:36 +01001392 // Load values from matrix B
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001393 uchar8 b0 = vload8(0, src1_ptr + src_addr.s1);
1394 src_addr.s1 += src1_stride_y;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001395
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001396 acc00 += (uint)a0 * b0.s0;
1397 acc01 += (uint)a0 * b0.s1;
1398 acc02 += (uint)a0 * b0.s2;
1399 acc03 += (uint)a0 * b0.s3;
1400 acc04 += (uint)a0 * b0.s4;
1401 acc05 += (uint)a0 * b0.s5;
1402 acc06 += (uint)a0 * b0.s6;
1403 acc07 += (uint)a0 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001404
Giorgio Arena6200fa42018-07-06 17:06:36 +01001405#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001406 acc10 += (uint)a1 * b0.s0;
1407 acc11 += (uint)a1 * b0.s1;
1408 acc12 += (uint)a1 * b0.s2;
1409 acc13 += (uint)a1 * b0.s3;
1410 acc14 += (uint)a1 * b0.s4;
1411 acc15 += (uint)a1 * b0.s5;
1412 acc16 += (uint)a1 * b0.s6;
1413 acc17 += (uint)a1 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001414#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1415#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001416 acc20 += (uint)a2 * b0.s0;
1417 acc21 += (uint)a2 * b0.s1;
1418 acc22 += (uint)a2 * b0.s2;
1419 acc23 += (uint)a2 * b0.s3;
1420 acc24 += (uint)a2 * b0.s4;
1421 acc25 += (uint)a2 * b0.s5;
1422 acc26 += (uint)a2 * b0.s6;
1423 acc27 += (uint)a2 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001424#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1425#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001426 acc30 += (uint)a3 * b0.s0;
1427 acc31 += (uint)a3 * b0.s1;
1428 acc32 += (uint)a3 * b0.s2;
1429 acc33 += (uint)a3 * b0.s3;
1430 acc34 += (uint)a3 * b0.s4;
1431 acc35 += (uint)a3 * b0.s5;
1432 acc36 += (uint)a3 * b0.s6;
1433 acc37 += (uint)a3 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001434#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Giorgio Arena6200fa42018-07-06 17:06:36 +01001435
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001436 src_addr.s0 += 1;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001437 }
1438
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001439 int z = get_global_id(2);
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001440
Giorgio Arena6200fa42018-07-06 17:06:36 +01001441 // Compute destination address
1442 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1443
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001444 // Compute dst address
1445 __global uchar *dst_addr = dst.ptr;
1446
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001447#if defined(REINTERPRET_OUTPUT_AS_3D)
1448 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
1449 // in order to take into account the presence of possible cross plane paddings
1450 //
1451 // | |
1452 // | plane0 |
1453 // | |
1454 // |__________________|
1455 // |******************|
1456 // | cross_plane_pad |
1457 // |******************|
1458 // | |
1459 // | plane1 |
1460 // | |
1461 // |__________________|
1462
1463 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001464 uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001465 zout = min(DEPTH_GEMM3D - 1, zout);
1466
1467 // Add offset due to the cross plane paddings
1468 zout *= (dst_cross_plane_pad * dst_stride_y);
1469
1470 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1471 // multiply dst_stride_z by DEPTH_GEMM3D
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001472 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001473
Giorgio Arena6200fa42018-07-06 17:06:36 +01001474 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001475 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));
1476 vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001477#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001478 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));
1479 vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001480#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1481#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001482 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));
1483 vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001484#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1485#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001486 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));
1487 vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001488#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001489
1490#else // defined(REINTERPRET_OUTPUT_AS_3D)
1491 // Add offset for batched GEMM
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001492 dst_addr += z * dst_stride_z;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001493
1494 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001495 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y));
1496 vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001497#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001498 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y));
1499 vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001500#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1501#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001502 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y));
1503 vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001504#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1505#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001506 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y));
1507 vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001508#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001509#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1510}
Georgios Pinitasdaa38552018-08-28 17:43:18 +01001511#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Gian Marco05288a22017-11-21 10:57:50 +00001512#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
1513
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001514#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001515/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM data type.
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001516 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
1517 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
1518 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001519 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
1520 * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001521 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
1522 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
1523 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
1524 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
1525 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
1526 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1527 * - M0 = 2, 3, 4, 5, 6, 7, 8
1528 * - N0 = 2, 3, 4, 8, 16
1529 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001530 * - V0 >= 1
1531 * - H0 >= 1
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001532 *
1533 * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
1534 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1535 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1536 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1537 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
1538 *
1539 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8
1540 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
1541 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1542 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
1543 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1544 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
1545 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1546 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
1547 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1548 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
1549 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1550 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
1551 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1552 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1553 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1554 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1555 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1556 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1557 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
1558 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
1559 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
1560 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1561 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
1562 */
1563__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
1564 IMAGE_DECLARATION(rhs),
1565 IMAGE_DECLARATION(dst),
1566 uint k,
1567 uint lhs_stride_z,
1568 uint rhs_stride_z,
1569 uint dst_stride_z
1570#if defined(REINTERPRET_OUTPUT_AS_3D)
1571 ,
1572 uint dst_cross_plane_pad
1573#endif // REINTERPRET_OUTPUT_AS_3D
1574 )
1575{
1576 // Block size
1577#define LHS_BLOCK_SIZE ((K0) * (M0))
1578
1579#if defined(LHS_INTERLEAVE)
1580#define LHS_OFFSET_X (K0)
1581#define LHS_STEP_X ((K0) * (V0))
1582#define LHS_STEP_LOOP (1)
1583#else // defined(INTERLEAVE)
1584#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
1585#define LHS_STEP_X (K0)
1586#define LHS_STEP_LOOP (V0)
1587#endif // defined(INTERLEAVE)
1588
1589 // Block size
1590#define RHS_BLOCK_SIZE ((K0) * (N0))
1591
1592 // RHS offset and step X
1593#if defined(RHS_INTERLEAVE)
1594#define RHS_OFFSET_X (K0)
1595#define RHS_STEP_X ((K0) * (H0))
1596#define RHS_STEP_LOOP (1)
1597#else // defined(RHS_INTERLEAVE)
1598#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1599#define RHS_STEP_X (K0)
1600#define RHS_STEP_LOOP (H0)
1601#endif // defined(RHS_INTERLEAVE)
1602
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001603 uint x = get_global_id(0);
1604 uint y = get_global_id(1);
1605 uint z = get_global_id(2);
1606
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001607#if defined(DUMMY_WORK_ITEMS)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001608 if((x * N0 >= N) || (y * M0 >= M))
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001609 {
1610 return;
1611 }
1612#endif // defined(DUMMY_WORK_ITEMS)
1613
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001614 // Compute LHS matrix address
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001615 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001616
1617 // Compute RHS matrix address
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001618 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001619
1620#if defined(MATRIX_B_DEPTH)
1621 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001622 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001623#else // defined(MATRIX_B_DEPTH)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001624 rhs_addr += z * rhs_stride_z;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001625#endif // defined(MATRIX_B_DEPTH)
1626
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001627 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1628 REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
1629
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001630 // Initialize the accumulators
1631 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
1632
1633 for(int i = 0; i < k; i += K0)
1634 {
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001635 // Load values from LHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001636 LOAD_BLOCK(M0, K0, uchar, a, lhs_addr, 0, LHS_STEP_X, zlhs);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001637
1638 // Load values from RHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001639 LOAD_BLOCK(N0, K0, uchar, b, rhs_addr, 0, RHS_STEP_X, zrhs);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001640
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001641 // Partial matrix multiplication M0,N0,K0
1642 ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001643
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001644 // Update address
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001645 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
1646 rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
1647 }
1648
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001649 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001650
1651 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1652
1653#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001654 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
1655 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001656
1657 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1658 // multiply dst_stride_z by DEPTH_GEMM3D
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001659 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001660
1661#else // defined(REINTERPRET_OUTPUT_AS_3D)
1662
1663 // Add offset for batched GEMM
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001664 dst_addr += z * dst_stride_z;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001665
1666#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1667
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001668 // Convert and store output block
1669 CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001670
1671#undef LHS_BLOCK_SIZE
1672#undef LHS_OFFSET_X
1673#undef LHS_STEP_X
1674#undef RHS_BLOCK_SIZE
1675#undef RHS_OFFSET_X
1676#undef RHS_STEP_X
1677}
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001678#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K)
1679
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001680#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
1681
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001682/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1683 * The LHS matrix is NOT reshaped
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001684 * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001685 *
1686 * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
1687 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
1688 * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
1689 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
1690 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
1691 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1692 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1693 * - N0 = 2, 3, 4, 8, 16
1694 * - K0 = 2, 3, 4, 8, 16
1695 * - H0 >= 1
1696 *
1697 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1698 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1699 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1700 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1701 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1702 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1703 *
1704 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
1705 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
1706 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1707 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
1708 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1709 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
1710 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1711 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
1712 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1713 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
1714 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1715 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
1716 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1717 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1718 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1719 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1720 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1721 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1722 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
1723 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
1724 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1725 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1726 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
1727 */
1728__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
1729 IMAGE_DECLARATION(rhs),
1730 IMAGE_DECLARATION(dst),
1731 uint lhs_stride_z,
1732 uint rhs_stride_z,
1733 uint dst_stride_z
1734#if defined(REINTERPRET_INPUT_AS_3D)
1735 ,
1736 uint lhs_cross_plane_pad
1737#endif // REINTERPRET_INPUT_AS_3D
1738#if defined(REINTERPRET_OUTPUT_AS_3D)
1739 ,
1740 uint dst_cross_plane_pad
1741#endif // REINTERPRET_OUTPUT_AS_3D
1742 )
1743{
1744 // Block size
1745#define RHS_BLOCK_SIZE ((K0) * (N0))
1746
1747 // RHS offset and step X
1748#if defined(RHS_INTERLEAVE)
1749#define RHS_OFFSET_X (K0)
1750#define RHS_STEP_X ((K0) * (H0))
1751#define RHS_STEP_LOOP (1)
1752#else // defined(RHS_INTERLEAVE)
1753#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1754#define RHS_STEP_X (K0)
1755#define RHS_STEP_LOOP (H0)
1756#endif // defined(RHS_INTERLEAVE)
1757
1758 uint x = get_global_id(0);
1759 uint y = get_global_id(1);
1760 uint z = get_global_id(2);
1761
Gian Marco Iodice86cfffe2019-04-02 11:02:20 +01001762#if defined(DUMMY_WORK_ITEMS)
1763 if((x * N0 >= N) || (y * M0 >= M))
1764 {
1765 return;
1766 }
1767#endif // defined(DUMMY_WORK_ITEMS)
1768
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001769 // Compute LHS matrix address
1770 uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
1771
1772 // Compute RHS matrix address
1773 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
1774
1775#if defined(MATRIX_B_DEPTH)
1776 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1777 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1778#else // defined(MATRIX_B_DEPTH)
1779 rhs_offset += z * rhs_stride_z;
1780#endif // defined(MATRIX_B_DEPTH)
1781
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001782 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1783 REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001784
1785#if defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001786 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
1787 CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001788
1789 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1790 // multiply lhs_stride_z by DEPTH_GEMM3D
1791 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1792
1793#else // defined(REINTERPRET_INPUT_AS_3D)
1794
1795 // Add offset for batched GEMM
1796 lhs_offset += z * lhs_stride_z;
1797
1798#endif // defined(REINTERPRET_INPUT_AS_3D)
1799
1800 // Initialize the accumulators
1801 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;
1802
1803 for(int i = 0; i < K; i += K0)
1804 {
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001805 // Load values from LHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001806 LOAD_BLOCK(M0, K0, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001807
1808 // Load values from RHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001809 LOAD_BLOCK(N0, K0, uchar, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001810
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001811 // Partial matrix multiplication M0,N0,K0
1812 ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001813
1814 lhs_offset += K0;
1815 rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
1816 }
1817
1818 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
1819
1820 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1821
1822#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001823 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001824 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001825
1826 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1827 // multiply dst_stride_z by DEPTH_GEMM3D
1828 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1829
1830#else // defined(REINTERPRET_OUTPUT_AS_3D)
1831
1832 // Add offset for batched GEMM
1833 dst_addr += z * dst_stride_z;
1834
1835#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1836
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001837 // Convert and store output block
1838 CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001839
1840#undef RHS_BLOCK_SIZE
1841#undef RHS_OFFSET_X
1842#undef RHS_STEP_X
1843}
1844#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(K)
1845
Gian Marco05288a22017-11-21 10:57:50 +00001846#if defined(COLS_A)
1847/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
1848 *
1849 * @note This stage is needed to handle the offset of matrix product
1850 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
1851 *
1852 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
1853 *
1854 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
1855 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1856 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1857 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1858 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1859 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1860 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1861 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1862 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
1863 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1864 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1865 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1866 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1867 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1868 */
1869__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
1870 IMAGE_DECLARATION(dst))
1871{
1872 // Compute source and destination addresses
1873 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1874 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1875
1876 uint4 sum_row_u32 = (uint4)0;
1877 uint sum_row = 0;
1878
1879 __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
1880
1881 int i = 0;
1882
1883 // This for loop performs 16 accumulations
1884 for(; i <= ((int)COLS_A - 16); i += 16)
1885 {
1886 const uchar16 a0_u8 = vload16(0, matrix_a + i);
1887
1888 sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
1889 }
1890
1891 // This for loop performs the leftover accumulations
1892 for(; i < COLS_A; ++i)
1893 {
1894 sum_row += matrix_a[i];
1895 }
1896
1897 sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;
1898
1899 *((__global int *)dst.ptr) = (int)sum_row;
1900}
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001901
1902#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
1903/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction
1904 *
1905 * @note This stage is needed to handle the offset of matrix product
1906 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
1907 *
1908 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
1909 *
1910 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
1911 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1912 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1913 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1914 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1915 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1916 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1917 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1918 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
1919 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1920 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1921 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1922 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1923 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1924 */
1925__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
1926 IMAGE_DECLARATION(dst))
1927{
1928 // Compute source and destination addresses
1929 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1930 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1931
1932 uint sum_row = 0;
1933
1934 __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
1935
1936 int i = 0;
1937
1938 // This for loop performs 16 accumulations
1939 for(; i <= ((int)COLS_A - 32); i += 32)
1940 {
1941 uchar16 a0_u8 = vload16(0, matrix_a + i);
1942
1943 sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
1944 sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
1945 sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
1946 sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
1947
1948 a0_u8 = vload16(1, matrix_a + i);
1949
1950 sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
1951 sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
1952 sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
1953 sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
1954 }
1955
1956 // This for loop performs the leftover accumulations
1957 for(; i < COLS_A; ++i)
1958 {
1959 sum_row += matrix_a[i];
1960 }
1961
1962 *((__global int *)dst.ptr) = (int)sum_row;
1963}
1964#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Gian Marco05288a22017-11-21 10:57:50 +00001965#endif // defined(COLS_A)
1966
1967#if defined(COLS_B) && defined(ROWS_B)
1968/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
1969 *
1970 * @note This stage is needed to handle the offset of matrix product
1971 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
1972 *
1973 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
1974 *
1975 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
1976 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1977 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1978 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1979 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1980 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1981 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1982 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1983 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
1984 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1985 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1986 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1987 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1988 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1989 */
1990__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
1991 IMAGE_DECLARATION(dst))
1992{
1993 // Compute source and destination addresses
1994 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1995 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1996
1997 uint16 sum_col_u32 = (uint16)0;
1998
1999 __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);
2000
2001 int i = 0;
2002 // This for loop performs 4 accumulations
2003 for(; i <= ((int)ROWS_B - 4); i += 4)
2004 {
2005 const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
2006 const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
2007 const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
2008 const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);
2009
2010 sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);
2011
2012 matrix_b += 4 * src_stride_y;
2013 }
2014
2015 // This for loop perfoms the leftover accumulations
2016 for(; i < (int)ROWS_B; ++i)
2017 {
2018 const uchar16 b0_u8 = vload16(0, matrix_b);
2019
2020 sum_col_u32 += convert_uint16(b0_u8);
2021
2022 matrix_b += src_stride_y;
2023 }
2024
2025 vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
2026}
2027#endif // defined(COLS_B) && defined(ROWS_B)
2028
2029#if defined(K_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002030
2031/* Helper function used to calculate the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel.
2032 *
2033 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),
2034 * and calculates the offset contribution of matrix A and matrix B.
2035 *
2036 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2037 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2038 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
2039 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
2040 *
2041 * @param[in] x get_global_id(0) * 4
2042 * @param[in] y get_global_id(1)
2043 * @param[in] z get_global_id(2)
2044 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2045 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2046 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2047 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2048 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2049 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2050 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2051 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2052 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2053 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2054 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2055 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2056 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2057 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2058 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2059 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
2060 */
2061inline int4 offset_contribution(
2062 int x,
2063 int y,
2064 int z
2065#if defined(A_OFFSET)
2066 ,
2067 IMAGE_DECLARATION(sum_col)
2068#endif // defined(A_OFFSET)
2069#if defined(B_OFFSET)
2070 ,
2071 IMAGE_DECLARATION(sum_row)
2072#endif // defined(B_OFFSET)
2073#if defined(ADD_BIAS)
2074 ,
2075 VECTOR_DECLARATION(biases)
2076#endif // defined(ADD_BIAS)
2077)
2078{
2079 int4 a_offset_s32 = (int4)0;
2080 int4 b_offset_s32 = (int4)0;
2081
2082 int batch_id = z;
2083#if defined(DEPTH_INPUT3D)
2084 batch_id /= (int)DEPTH_INPUT3D;
2085#endif // defined(DEPTH_INPUT3D)
2086
2087#if defined(A_OFFSET)
2088 // Compute the offset contribution due to A_OFFSET
2089 __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
2090
2091 // Compute the offset contribution due to A_OFFSET
2092#if defined(SUM_COL_HAS_BATCHES)
2093 a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
2094#else // defined(SUM_COL_HAS_BATCHES)
2095 a_offset_s32 = vload4(0, (__global int *)sum_col_addr);
2096#endif // defined(SUM_COL_HAS_BATCHES)
2097
2098 a_offset_s32 *= (int4)A_OFFSET;
2099#endif // defined(A_OFFSET)
2100
2101#if defined(B_OFFSET)
2102 // Compute the offset contribution due to A_OFFSET
2103 __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
2104
2105 // Compute the offset contribution due to B_OFFSET
2106#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
2107 b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
2108#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
2109 b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
2110#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
2111 b_offset_s32 *= (int4)B_OFFSET;
2112#endif // defined(B_OFFSET)
2113
2114#if defined(ADD_BIAS)
2115 // Add bias
2116 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2117
2118 int4 biases_values = vload4(0, (__global int *)bias_addr);
2119 b_offset_s32 += (int4)biases_values;
2120#endif // defined(ADD_BIAS)
2121
2122 return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
2123}
2124
Gian Marco05288a22017-11-21 10:57:50 +00002125/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
2126 *
2127 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),
2128 * and adds to it the offset contribution of matrix A and matrix B in-place.
2129 *
2130 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2131 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2132 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
Chunosov5124be52017-11-22 20:42:13 +07002133 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
Gian Marco05288a22017-11-21 10:57:50 +00002134 *
2135 * The final result is:
2136 *
2137 * mm_result[i][k] = mm_result[i][k] +
2138 * (sum_col[k] * A_OFFSET) +
2139 * (sum_row[i] * B_OFFSET) +
2140 * (K_OFFSET)
2141 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01002142 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
2143 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
2144 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
2145 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
2146 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
2147 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
2148 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
2149 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002150 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2151 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2152 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2153 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2154 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2155 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2156 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2157 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2158 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2159 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2160 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2161 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2162 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2163 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2164 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2165 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco05288a22017-11-21 10:57:50 +00002166 */
2167__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
2168#if defined(A_OFFSET)
2169 ,
2170 IMAGE_DECLARATION(sum_col)
2171#endif // defined(A_OFFSET)
2172#if defined(B_OFFSET)
2173 ,
2174 IMAGE_DECLARATION(sum_row)
2175#endif // defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002176#if defined(ADD_BIAS)
2177 ,
2178 VECTOR_DECLARATION(biases)
2179#endif // defined(ADD_BIAS))
Gian Marco05288a22017-11-21 10:57:50 +00002180 )
2181{
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002182 const int x = get_global_id(0) * 4;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01002183 const int y = get_global_id(1);
2184 const int z = get_global_id(2);
2185
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002186 // Compute offset contribution
2187 int4 offset_term_s32 = offset_contribution(
2188 x, y, z
Gian Marco05288a22017-11-21 10:57:50 +00002189#if defined(A_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002190 ,
2191 sum_col_ptr,
2192 sum_col_stride_x,
2193 sum_col_step_x,
2194 sum_col_stride_y,
2195 sum_col_step_y,
2196 sum_col_offset_first_element_in_bytes
Gian Marco05288a22017-11-21 10:57:50 +00002197#endif // defined(A_OFFSET)
Gian Marco05288a22017-11-21 10:57:50 +00002198#if defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002199 ,
2200 sum_row_ptr,
2201 sum_row_stride_x,
2202 sum_row_step_x,
2203 sum_row_stride_y,
2204 sum_row_step_y,
2205 sum_row_offset_first_element_in_bytes
Gian Marco05288a22017-11-21 10:57:50 +00002206#endif // defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002207#if defined(ADD_BIAS)
2208 ,
2209 biases_ptr,
2210 biases_stride_x,
2211 biases_step_x,
2212 biases_offset_first_element_in_bytes
2213#endif // defined(ADD_BIAS)
2214 );
Gian Marco05288a22017-11-21 10:57:50 +00002215
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002216 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
Gian Marco05288a22017-11-21 10:57:50 +00002217
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002218 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
Gian Marco05288a22017-11-21 10:57:50 +00002219
2220 // Add the offset terms to GEMM's result
2221 in_s32 += offset_term_s32;
2222
2223 // Store the result with the offset contribution
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002224 vstore4(in_s32, 0, (__global int *)mm_result_addr);
Gian Marco05288a22017-11-21 10:57:50 +00002225}
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002226
2227#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
2228/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.
2229 *
2230 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
2231 *
2232 *
2233 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2234 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2235 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
2236 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
2237 *
2238 * The result before the output stage is:
2239 *
2240 * mm_result[i][k] = mm_result[i][k] +
2241 * (sum_col[k] * A_OFFSET) +
2242 * (sum_row[i] * B_OFFSET) +
2243 * (K_OFFSET)
2244 *
2245 * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
2246 *
2247 * -# Add offset terms to final result
2248 * -# Multiply each entry of result by result_mult_int
2249 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
2250 * -# Shift the int32 accumulator by result_shift
2251 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
2252 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2253 *
2254 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
2255 *
2256 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2257 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2258 * These values can be used to implement "rectified linear unit" activation functions
2259 *
2260 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
2261 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
2262 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
2263 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
2264 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
2265 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
2266 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
2267 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
2268 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2269 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2270 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2271 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2272 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2273 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2274 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2275 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2276 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2277 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2278 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2279 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2280 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2281 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2282 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2283 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
2284 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2285 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2286 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2287 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2288 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2289 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2290 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2291 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2292 */
2293__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
2294#if defined(A_OFFSET)
2295 ,
2296 IMAGE_DECLARATION(sum_col)
2297#endif // defined(A_OFFSET)
2298#if defined(B_OFFSET)
2299 ,
2300 IMAGE_DECLARATION(sum_row)
2301#endif // defined(B_OFFSET)
2302 ,
2303#if defined(ADD_BIAS)
2304 VECTOR_DECLARATION(biases),
2305#endif // defined(ADD_BIAS)
2306 TENSOR3D_DECLARATION(dst))
2307{
2308 const int x = get_global_id(0) * 4;
2309 const int y = get_global_id(1);
2310 const int z = get_global_id(2);
2311
2312 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2313
2314 // Compute offset contribution
2315 int4 offset_term_s32 = offset_contribution(
2316 x, y, z
2317#if defined(A_OFFSET)
2318 ,
2319 sum_col_ptr,
2320 sum_col_stride_x,
2321 sum_col_step_x,
2322 sum_col_stride_y,
2323 sum_col_step_y,
2324 sum_col_offset_first_element_in_bytes
2325#endif // defined(A_OFFSET)
2326#if defined(B_OFFSET)
2327 ,
2328 sum_row_ptr,
2329 sum_row_stride_x,
2330 sum_row_step_x,
2331 sum_row_stride_y,
2332 sum_row_step_y,
2333 sum_row_offset_first_element_in_bytes
2334#endif // defined(B_OFFSET)
2335#if defined(ADD_BIAS)
2336 ,
2337 biases_ptr,
2338 biases_stride_x,
2339 biases_step_x,
2340 biases_offset_first_element_in_bytes
2341#endif // defined(ADD_BIAS)
2342 );
2343
2344 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
2345
2346 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
2347
2348 // Add the offset terms to GEMM's result
2349 in_s32 += offset_term_s32;
2350
2351 // -------------- OUTPUT STAGE
2352
2353 // Add the offset terms to GEMM's result
2354 in_s32 += (int4)RESULT_OFFSET;
2355
2356 // Multiply by result_mult_int and shift
2357 in_s32 *= RESULT_MULTIPLIER;
2358
2359 in_s32 >>= RESULT_SHIFT;
2360
2361 uchar4 res = convert_uchar4_sat(in_s32);
2362
2363#if defined(MIN_BOUND)
2364 res = max(res, (uchar4)MIN_BOUND);
2365#endif // defined(MIN_BOUND)
2366#if defined(MAX_BOUND)
2367 res = min(res, (uchar4)MAX_BOUND);
2368#endif // defined(MAX_BOUND)
2369
2370 // Store the result
2371 vstore4(res, 0, dst_addr);
2372}
2373
2374/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.
2375 *
2376 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
2377 *
2378 *
2379 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2380 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2381 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
2382 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
2383 *
2384 * The result before the output stage is:
2385 *
2386 * mm_result[i][k] = mm_result[i][k] +
2387 * (sum_col[k] * A_OFFSET) +
2388 * (sum_row[i] * B_OFFSET) +
2389 * (K_OFFSET)
2390 *
2391 * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
2392 *
2393 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
2394 * -# Add bias to final result if bias tensor is not a nullptr
2395 * -# Round to nearest division by a power-of-two using result_shift
2396 * -# Add offset to each result
2397 * -# Clamp the value between the specified min and max bounds
2398 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2399 *
2400 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
2401 *
2402 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2403 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2404 * These values can be used to implement "rectified linear unit" activation functions
2405 *
2406 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
2407 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
2408 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
2409 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
2410 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
2411 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
2412 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
2413 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
2414 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2415 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2416 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2417 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2418 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2419 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2420 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2421 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2422 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2423 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2424 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2425 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2426 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2427 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2428 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2429 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
2430 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2431 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2432 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2433 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2434 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2435 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2436 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2437 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2438 */
2439__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
2440#if defined(A_OFFSET)
2441 ,
2442 IMAGE_DECLARATION(sum_col)
2443#endif // defined(A_OFFSET)
2444#if defined(B_OFFSET)
2445 ,
2446 IMAGE_DECLARATION(sum_row)
2447#endif // defined(B_OFFSET)
2448 ,
2449#if defined(ADD_BIAS)
2450 VECTOR_DECLARATION(biases),
2451#endif // defined(ADD_BIAS)
2452 TENSOR3D_DECLARATION(dst))
2453{
2454 const int x = get_global_id(0) * 4;
2455 const int y = get_global_id(1);
2456 const int z = get_global_id(2);
2457
2458 // Compute offset contribution
2459 int4 offset_term_s32 = offset_contribution(
2460 x, y, z
2461#if defined(A_OFFSET)
2462 ,
2463 sum_col_ptr,
2464 sum_col_stride_x,
2465 sum_col_step_x,
2466 sum_col_stride_y,
2467 sum_col_step_y,
2468 sum_col_offset_first_element_in_bytes
2469#endif // defined(A_OFFSET)
2470#if defined(B_OFFSET)
2471 ,
2472 sum_row_ptr,
2473 sum_row_stride_x,
2474 sum_row_step_x,
2475 sum_row_stride_y,
2476 sum_row_step_y,
2477 sum_row_offset_first_element_in_bytes
2478#endif // defined(B_OFFSET)
2479#if defined(ADD_BIAS)
2480 ,
2481 biases_ptr,
2482 biases_stride_x,
2483 biases_step_x,
2484 biases_offset_first_element_in_bytes
2485#endif // defined(ADD_BIAS)
2486 );
2487
2488 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
2489
2490 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2491
2492 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
2493
2494 // Add the offset terms to GEMM's result
2495 in_s32 += offset_term_s32;
2496
2497 // -------------- OUTPUT STAGE
2498
2499 // Multiply by result_mult_int and shift
2500 in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
2501
2502 // Add the offset terms to GEMM's result
2503 in_s32 += (int4)RESULT_OFFSET;
2504
2505 uchar4 res = convert_uchar4_sat(in_s32);
2506
2507#if defined(MIN_BOUND)
2508 res = max(res, (uchar4)MIN_BOUND);
2509#endif // defined(MIN_BOUND)
2510#if defined(MAX_BOUND)
2511 res = min(res, (uchar4)MAX_BOUND);
2512#endif // defined(MAX_BOUND)
2513
2514 // Store the result
2515 vstore4(res, 0, dst_addr);
2516}
2517#endif // defined(K_OFFSET) && defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
Gian Marco05288a22017-11-21 10:57:50 +00002518#endif // defined(K_OFFSET)
2519
2520#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
2521/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
2522 *
2523 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
2524 * The following computations will be performed by the kernel:
2525 *
2526 * -# Add offset terms to final result
2527 * -# Multiply each entry of result by result_mult_int
2528 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
2529 * -# Shift the int32 accumulator by result_shift
2530 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
2531 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2532 *
2533 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
2534 *
2535 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2536 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2537 * These values can be used to implement "rectified linear unit" activation functions
2538 *
2539 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
2540 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2541 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2542 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2543 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2544 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2545 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2546 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002547 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2548 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2549 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2550 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco05288a22017-11-21 10:57:50 +00002551 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2552 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2553 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2554 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2555 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2556 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2557 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2558 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2559 */
2560__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
2561#if defined(ADD_BIAS)
2562 VECTOR_DECLARATION(biases),
2563#endif // defined(ADD_BIAS)
2564 TENSOR3D_DECLARATION(dst))
2565{
2566 // Compute source and destination addresses
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002567 int x = get_global_id(0) * 4;
2568 int y = get_global_id(1);
2569 int z = get_global_id(2);
Gian Marco05288a22017-11-21 10:57:50 +00002570
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002571 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Gian Marco05288a22017-11-21 10:57:50 +00002572
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002573 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2574
2575 int4 input_values = vload4(0, (__global int *)src_addr);
Gian Marco58c57942017-11-28 09:10:03 +00002576
Gian Marco05288a22017-11-21 10:57:50 +00002577#if defined(ADD_BIAS)
2578 // Add bias
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002579 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2580
2581 int4 biases_values = vload4(0, (__global int *)bias_addr);
2582 input_values += (int4)biases_values;
Gian Marco05288a22017-11-21 10:57:50 +00002583#endif // defined(ADD_BIAS)
2584
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002585 // Add the offset terms to GEMM's result
2586 input_values += (int4)RESULT_OFFSET;
2587
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +00002588 // Multiply by result_mult_int and shift
Gian Marco58c57942017-11-28 09:10:03 +00002589 input_values *= RESULT_MULT_INT;
Gian Marco05288a22017-11-21 10:57:50 +00002590
Gian Marco58c57942017-11-28 09:10:03 +00002591 input_values >>= RESULT_SHIFT;
Gian Marco05288a22017-11-21 10:57:50 +00002592
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002593 uchar4 res = convert_uchar4_sat(input_values);
Gian Marco05288a22017-11-21 10:57:50 +00002594
2595#if defined(MIN_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002596 res = max(res, (uchar4)MIN_BOUND);
Gian Marco05288a22017-11-21 10:57:50 +00002597#endif // defined(MIN_BOUND)
2598#if defined(MAX_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002599 res = min(res, (uchar4)MAX_BOUND);
Gian Marco05288a22017-11-21 10:57:50 +00002600#endif // defined(MAX_BOUND)
2601
2602 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002603 vstore4(res, 0, dst_addr);
Gian Marco05288a22017-11-21 10:57:50 +00002604}
Gian Marco58c57942017-11-28 09:10:03 +00002605#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
2606
2607#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
2608/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
2609 *
2610 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
2611 * The following computations will be performed by the kernel:
2612 *
2613 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
2614 * -# Add bias to final result if bias tensor is not a nullptr
2615 * -# Round to nearest division by a power-of-two using result_shift
2616 * -# Add offset to each result
2617 * -# Clamp the value between the specified min and max bounds
2618 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2619 *
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002620 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
Gian Marco58c57942017-11-28 09:10:03 +00002621 *
2622 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2623 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2624 * These values can be used to implement "rectified linear unit" activation functions
2625 *
2626 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
2627 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2628 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2629 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2630 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2631 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2632 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2633 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002634 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2635 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2636 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2637 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco58c57942017-11-28 09:10:03 +00002638 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2639 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2640 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2641 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2642 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2643 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2644 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2645 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2646 */
2647__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
2648#if defined(ADD_BIAS)
2649 VECTOR_DECLARATION(biases),
2650#endif // defined(ADD_BIAS)
2651 TENSOR3D_DECLARATION(dst))
2652{
2653 // Compute source and destination addresses
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002654 int x = get_global_id(0) * 4;
2655 int y = get_global_id(1);
2656 int z = get_global_id(2);
Georgios Pinitas932491f2018-09-21 16:33:15 +01002657
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002658 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Gian Marco58c57942017-11-28 09:10:03 +00002659
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002660 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2661
2662 int4 input_values = vload4(0, (__global int *)src_addr);
Gian Marco58c57942017-11-28 09:10:03 +00002663
2664#if defined(ADD_BIAS)
2665 // Add bias
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002666 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2667
2668 int4 biases_values = vload4(0, (__global int *)bias_addr);
2669 input_values += (int4)biases_values;
Gian Marco58c57942017-11-28 09:10:03 +00002670#endif // defined(ADD_BIAS)
2671
2672 // Multiply by result_mult_int and shift
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002673 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
Gian Marco58c57942017-11-28 09:10:03 +00002674
2675 // Add the offset terms to GEMM's result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002676 input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
Gian Marco58c57942017-11-28 09:10:03 +00002677
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002678 uchar4 res = convert_uchar4_sat(input_values);
Gian Marco58c57942017-11-28 09:10:03 +00002679
2680#if defined(MIN_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002681 res = max(res, (uchar4)MIN_BOUND);
Gian Marco58c57942017-11-28 09:10:03 +00002682#endif // defined(MIN_BOUND)
2683#if defined(MAX_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002684 res = min(res, (uchar4)MAX_BOUND);
Gian Marco58c57942017-11-28 09:10:03 +00002685#endif // defined(MAX_BOUND)
2686
2687 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002688 vstore4(res, 0, dst_addr);
Gian Marco58c57942017-11-28 09:10:03 +00002689}
Chunosov5124be52017-11-22 20:42:13 +07002690#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002691
2692#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
2693/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
2694 *
2695 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
2696 * The following computations will be performed by the kernel:
2697 *
2698 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
2699 * -# Add bias to final result if bias tensor is not a nullptr
2700 * -# Requantize
2701 * -# Add offset to each result
2702 * -# Clamp the value between the specified min and max bounds
2703 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2704 *
2705 * @attention The offset and scalar scale factor must be passed at compile time using -DRESULT_OFFSET, -DREAL_MULTIPLIER
2706 *
2707 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2708 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2709 * These values can be used to implement "rectified linear unit" activation functions
2710 *
2711 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
2712 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2713 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2714 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2715 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2716 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2717 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2718 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2719 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
2720 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
2721 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
2722 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
2723 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2724 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2725 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2726 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2727 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2728 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2729 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2730 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2731 * @param[in] dst_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
2732 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2733 */
2734__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
2735#if defined(ADD_BIAS)
2736 VECTOR_DECLARATION(biases),
2737#endif // defined(ADD_BIAS)
2738#if defined(DST_HEIGHT)
2739 TENSOR4D_DECLARATION(dst))
2740#else // defined(DST_HEIGHT)
2741 TENSOR3D_DECLARATION(dst))
2742#endif // defined(DST_HEIGHT)
2743{
2744 // Compute source and destination addresses
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002745 int x = get_global_id(0) * 4;
2746 int y = get_global_id(1);
2747 int z = get_global_id(2);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002748
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002749 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002750
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002751 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2752
2753 int4 input_values = vload4(0, (__global int *)src_addr);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002754
2755#if defined(ADD_BIAS)
2756 // Add bias
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002757 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2758
2759 int4 biases_values = vload4(0, (__global int *)bias_addr);
2760 input_values += (int4)biases_values;
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002761#endif // defined(ADD_BIAS)
2762
2763 // Convert to float
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002764 float16 input_values_f = convert_float4(input_values);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002765 input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
2766
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002767 uchar4 res = convert_uchar4_sat(input_values_f);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002768
2769#if defined(MIN_BOUND)
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002770 res = max(res, (uchar4)MIN_BOUND);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002771#endif // defined(MIN_BOUND)
2772#if defined(MAX_BOUND)
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002773 res = min(res, (uchar4)MAX_BOUND);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002774#endif // defined(MAX_BOUND)
2775
2776 // Store the result
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002777 vstore4(res, 0, dst_addr);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002778}
Gian Marco Iodicedb18a6f2019-05-30 09:53:10 +01002779#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)