blob: 65c31efe2b605342286be7149e438ec53a61ae29 [file] [log] [blame]
Gian Marco05288a22017-11-21 10:57:50 +00001/*
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00002 * Copyright (c) 2017-2019 ARM Limited.
Gian Marco05288a22017-11-21 10:57:50 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010024#include "gemm_helpers.h"
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +000025#include "helpers_asymm.h"
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +000026#include "repeat.h"
Gian Marco05288a22017-11-21 10:57:50 +000027
Georgios Pinitasdaa38552018-08-28 17:43:18 +010028#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
29#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010030#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010031#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010032#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010033#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
34#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenac50da382018-07-26 15:50:09 +010035
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010036#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
37
38/** Specialized macros to perform the dot product instruction between two vectors of size N [1,16]. These macros use the dot8 instruction */
39#define ARM_DOT1(a, b, c) \
40 ({ \
41 ARM_DOT((uchar4)(a, (uchar3)0), (uchar4)(b, (uchar3)0), c); \
42 })
43#define ARM_DOT2(a, b, c) \
44 ({ \
45 ARM_DOT((uchar4)(a, (uchar2)0), (uchar4)(b, (uchar2)0), c); \
46 })
47#define ARM_DOT3(a, b, c) \
48 ({ \
49 ARM_DOT((uchar4)(a, (uchar)0), (uchar4)(b, (uchar)0), c); \
50 })
51#define ARM_DOT4(a, b, c) \
52 ({ \
53 ARM_DOT(a, b, c); \
54 })
55#define ARM_DOT8(a, b, c) \
56 ({ \
57 ARM_DOT4((a.lo), (b.lo), c); \
58 ARM_DOT4((a.hi), (b.hi), c); \
59 })
60#define ARM_DOT16(a, b, c) \
61 ({ \
62 ARM_DOT8((a.lo), (b.lo), c); \
63 ARM_DOT8((a.hi), (b.hi), c); \
64 })
65
66#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
67
68/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] without using the dot8 instruction. */
Georgios Pinitas705fd3d2019-06-17 17:23:22 +010069#define ARM_DOT1(a, b, c) \
70 ({ \
71 c += (uint)a * b; \
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010072 })
73#define ARM_DOT2(a, b, c) \
74 ({ \
Georgios Pinitas705fd3d2019-06-17 17:23:22 +010075 c += (uint)a.s0 * b.s0; \
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010076 c += (uint)a.s1 * b.s1; \
77 })
78#define ARM_DOT3(a, b, c) \
79 ({ \
80 ARM_DOT2(a, b, c); \
81 c += (uint)a.s2 * b.s2; \
82 })
83#define ARM_DOT4(a, b, c) \
84 ({ \
85 ARM_DOT3(a, b, c); \
86 c += (uint)a.s3 * b.s3; \
87 })
88#define ARM_DOT8(a, b, c) \
89 ({ \
90 ARM_DOT4((a.lo), (b.lo), c); \
91 ARM_DOT4((a.hi), (b.hi), c); \
92 })
93#define ARM_DOT16(a, b, c) \
94 ({ \
95 ARM_DOT8((a.lo), (b.lo), c); \
96 ARM_DOT8((a.hi), (b.hi), c); \
97 })
98#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
99
100/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
101#define ARM_DOT_K0X2(k0, a, b, c) \
102 ({ \
103 ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
104 ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
105 })
106#define ARM_DOT_K0X3(k0, a, b, c) \
107 ({ \
108 ARM_DOT_K0X2(k0, a, b, c); \
109 ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
110 })
111#define ARM_DOT_K0X4(k0, a, b, c) \
112 ({ \
113 ARM_DOT_K0X3(k0, a, b, c); \
114 ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
115 })
116#define ARM_DOT_K0X8(k0, a, b, c) \
117 ({ \
118 ARM_DOT_K0X4(k0, a, b, c); \
119 ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
120 ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
121 ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
122 ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
123 })
124#define ARM_DOT_K0X16(k0, a, b, c) \
125 ({ \
126 ARM_DOT_K0X8(k0, a, b, c); \
127 ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
128 ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
129 ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
130 ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
131 ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
132 ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
133 ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
134 ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
135 })
136
Georgios Pinitas705fd3d2019-06-17 17:23:22 +0100137/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */
Gian Marco Iodice43a129e2019-05-14 10:14:08 +0100138#define ARM_MM_K0XN0X1(n0, k0, a, b, c) \
139 ({ \
140 ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
141 })
142#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
143 ({ \
144 ARM_MM_K0XN0X1(n0, k0, a, b, c); \
145 ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
146 })
147#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
148 ({ \
149 ARM_MM_K0XN0X2(n0, k0, a, b, c); \
150 ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
151 })
152#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
153 ({ \
154 ARM_MM_K0XN0X3(n0, k0, a, b, c); \
155 ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
156 })
157#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
158 ({ \
159 ARM_MM_K0XN0X4(n0, k0, a, b, c); \
160 ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
161 })
162#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
163 ({ \
164 ARM_MM_K0XN0X5(n0, k0, a, b, c); \
165 ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
166 })
167#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
168 ({ \
169 ARM_MM_K0XN0X6(n0, k0, a, b, c); \
170 ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
171 })
172#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
173 ({ \
174 ARM_MM_K0XN0X7(n0, k0, a, b, c); \
175 ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
176 })
177
178#define ARM_DOT_K0(k0, a, b, c) \
179 ({ \
180 CONCAT(ARM_DOT, k0) \
181 ((a), (b), (c)); \
182 })
183
184#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
185 ({ \
186 CONCAT(ARM_DOT_K0X, n0) \
187 (k0, (a), b, (c)); \
188 })
189
190#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
191 ({ \
192 CONCAT(ARM_MM_K0XN0X, m0) \
193 (n0, k0, a, b, c); \
194 })
195
Gian Marco19835e52018-01-30 13:35:54 +0000196#if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +0000197/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco Iodice5fc07aa2019-05-15 17:08:02 +0100198 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel before running the matrix multiplication
Gian Marco05288a22017-11-21 10:57:50 +0000199 *
Gian Marco19835e52018-01-30 13:35:54 +0000200 * @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024
201 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
202 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
Gian Marco05288a22017-11-21 10:57:50 +0000203 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100204 * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
205 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
206 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
207 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
208 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
209 *
Gian Marco05288a22017-11-21 10:57:50 +0000210 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
211 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
212 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
213 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
214 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
215 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
216 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
217 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
218 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
219 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
220 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
221 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
222 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
223 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
224 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
225 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
226 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
227 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100228 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
229 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
230 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
231 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +0000232 */
Gian Marco19835e52018-01-30 13:35:54 +0000233__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
234 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100235 IMAGE_DECLARATION(dst),
236 uint src0_stride_z,
237 uint src1_stride_z,
238 uint dst_stride_z
239#if defined(REINTERPRET_OUTPUT_AS_3D)
240 ,
241 uint cross_plane_pad
242#endif // REINTERPRET_OUTPUT_AS_3D
243 )
Gian Marco05288a22017-11-21 10:57:50 +0000244{
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100245 const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
246 const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
247 const int z = get_global_id(2);
Gian Marco05288a22017-11-21 10:57:50 +0000248
Gian Marco19835e52018-01-30 13:35:54 +0000249 // Offset
250 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
251 const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
252
253 // src_addr_a = address of matrix A
254 // src_addr_b = address of matrix B
Isabella Gottardib92805b2018-09-28 18:24:27 +0100255 __global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
Gian Marco19835e52018-01-30 13:35:54 +0000256 __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
Gian Marco05288a22017-11-21 10:57:50 +0000257
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100258#if defined(MATRIX_B_DEPTH)
259 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
260 src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;
261#else // defined(MATRIX_B_DEPTH)
262 src_addr_b += z * src1_stride_z;
263#endif // defined(MATRIX_B_DEPTH)
264
Gian Marco05288a22017-11-21 10:57:50 +0000265 // Compute end row address for matrix B
Gian Marco19835e52018-01-30 13:35:54 +0000266 __global uchar *src_end_addr_b = src_addr_b + COLS_B;
267
268 src_addr_a += offset_row_a;
269 src_addr_b += offset_row_b;
Gian Marco05288a22017-11-21 10:57:50 +0000270
271 // Reset accumulators
Gian Marco19835e52018-01-30 13:35:54 +0000272 int4 c00 = 0;
273 int4 c10 = 0;
274 int4 c20 = 0;
275 int4 c30 = 0;
Gian Marco05288a22017-11-21 10:57:50 +0000276
Gian Marco19835e52018-01-30 13:35:54 +0000277 for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +0000278 {
279 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco19835e52018-01-30 13:35:54 +0000280 int4 a0 = convert_int4(vload4(0, src_addr_a));
281 int4 b0 = convert_int4(vload4(0, src_addr_b));
Gian Marco05288a22017-11-21 10:57:50 +0000282
Gian Marco19835e52018-01-30 13:35:54 +0000283 c00 += (int4)a0.s0 * b0;
284 c10 += (int4)a0.s1 * b0;
285 c20 += (int4)a0.s2 * b0;
286 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000287
Gian Marco19835e52018-01-30 13:35:54 +0000288 a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
289 b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));
Gian Marco05288a22017-11-21 10:57:50 +0000290
Gian Marco19835e52018-01-30 13:35:54 +0000291 c00 += (int4)a0.s0 * b0;
292 c10 += (int4)a0.s1 * b0;
293 c20 += (int4)a0.s2 * b0;
294 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000295 }
296
Gian Marco19835e52018-01-30 13:35:54 +0000297 for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
Gian Marco05288a22017-11-21 10:57:50 +0000298 {
299 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco19835e52018-01-30 13:35:54 +0000300 int4 a0 = convert_int4(vload4(0, src_addr_a));
301 int4 b0 = convert_int4(vload4(0, src_addr_b));
Gian Marco05288a22017-11-21 10:57:50 +0000302
Gian Marco19835e52018-01-30 13:35:54 +0000303 c00 += (int4)a0.s0 * b0;
304 c10 += (int4)a0.s1 * b0;
305 c20 += (int4)a0.s2 * b0;
306 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000307 }
308
309 // Compute destination address
310 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
311
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100312#if defined(REINTERPRET_OUTPUT_AS_3D)
313 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
314 // in order to take into account the presence of possible cross plane paddings
315 //
316 // | |
317 // | plane0 |
318 // | |
319 // |__________________|
320 // |******************|
321 // | cross_plane_pad |
322 // |******************|
323 // | |
324 // | plane1 |
325 // | |
326 // |__________________|
327
328 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
329 uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
330 zout = min(DEPTH_GEMM3D - 1, zout);
331
332 // Add offset due to the cross plane paddings
333 zout *= (cross_plane_pad * dst_stride_y);
334
335 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
336 // multiply dst_stride_z by DEPTH_GEMM3D
337 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
338
Gian Marco19835e52018-01-30 13:35:54 +0000339 // Store 4x4 block
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100340 vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
341 vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
342 vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
343 vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
344
345#else // defined(REINTERPRET_OUTPUT_AS_3D)
346 // Add offset for batched GEMM
347 dst.ptr += z * dst_stride_z;
348
349 // Store 4x4 block
350 vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
351 vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
352 vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
353 vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
354#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +0000355}
Gian Marco19835e52018-01-30 13:35:54 +0000356#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +0000357
358#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
359#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
360#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
361#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
362/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
363 *
364 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
365 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100366 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
367 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
368 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
369 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
370 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
371 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
372 *
Gian Marco05288a22017-11-21 10:57:50 +0000373 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
374 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
375 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
376 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
377 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
378 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
379 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
380 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
381 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
382 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
383 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
384 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
385 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
386 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
387 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
388 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
389 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
390 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100391 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
392 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
393 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
394 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
395 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +0000396 */
Gian Marco7b4d5472018-01-10 15:56:30 +0000397__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
398 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100399 IMAGE_DECLARATION(dst),
400 uint src0_stride_z,
401 uint src1_stride_z,
402 uint dst_stride_z
403#if defined(REINTERPRET_INPUT_AS_3D)
404 ,
405 uint src_cross_plane_pad
406#endif // REINTERPRET_INPUT_AS_3D
407#if defined(REINTERPRET_OUTPUT_AS_3D)
408 ,
409 uint dst_cross_plane_pad
410#endif // REINTERPRET_OUTPUT_AS_3D
411 )
Gian Marco05288a22017-11-21 10:57:50 +0000412{
413 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
414
415 // Compute starting address for matrix A and Matrix B
416 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
417
418 // Update address for the matrix A
419 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
420
421 // Update address for the matrix B
422 src_addr.s1 += idx;
423
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100424#if defined(REINTERPRET_INPUT_AS_3D)
425 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
426 // in order to take into account the presence of possible cross plane paddings
427 //
428 // | |
429 // | plane0 |
430 // | |
431 // |__________________|
432 // |******************|
433 // | cross_plane_pad |
434 // |******************|
435 // | |
436 // | plane1 |
437 // | |
438 // |__________________|
439
440 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
441 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
442 zin = min(DEPTH_GEMM3D - 1, zin);
443
444 // Add offset due to the cross plane paddings
445 zin *= (src_cross_plane_pad * src0_stride_y);
446
447 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
448 // multiply src0_stride_z by DEPTH_GEMM3D
449 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
450
451#else // defined(REINTERPRET_INPUT_AS_3D)
452
453 // Add offset for batched GEMM
454 src_addr.s0 += get_global_id(2) * src0_stride_z;
455
456#endif // defined(REINTERPRET_INPUT_AS_3D)
457
458#if defined(MATRIX_B_DEPTH)
459 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
460 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
461#else // defined(MATRIX_B_DEPTH)
462 src_addr.s1 += get_global_id(2) * src1_stride_z;
463#endif // defined(MATRIX_B_DEPTH)
464
Gian Marco05288a22017-11-21 10:57:50 +0000465 int end_row_vec_a = src_addr.s0 + COLS_A;
466
467 VECTOR_UINT acc0 = 0;
468#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
469 VECTOR_UINT acc1 = 0;
470#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
471#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
472 VECTOR_UINT acc2 = 0;
473#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
474#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
475 VECTOR_UINT acc3 = 0;
476#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000477#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
478 VECTOR_UINT acc4 = 0;
479#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000480
481 for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
482 {
483 // Load values from matrix A
484 uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
485#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
486 uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
487#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
488#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
489 uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
490#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
491#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
492 uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
493#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000494#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
495 uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
496#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000497 // Load values from matrix B
498 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
499 VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);
500
501 // Accumulate
502 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
503 acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
504#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
505 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
506 acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
507#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
508#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
509 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
510 acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
511#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
512#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
513 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
514 acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
515#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000516#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
517 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
518 acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
519#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000520 }
521
522 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
523 {
524 // Load values from matrix A
525 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
526#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
527 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
528#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
529#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
530 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
531#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
532#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
533 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
534#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000535#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
536 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
537#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000538 // Load values from matrix B
539 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
540
541 // Accumulate
542 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
543#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
544 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
545#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
546#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
547 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
548#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
549#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
550 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
551#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000552#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
553 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
554#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000555 }
556
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100557 const int z = get_global_id(2);
558
Gian Marco05288a22017-11-21 10:57:50 +0000559 // Compute destination address
560 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
561
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100562#if defined(REINTERPRET_OUTPUT_AS_3D)
563 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
564 // in order to take into account the presence of possible cross plane paddings
565 //
566 // | |
567 // | plane0 |
568 // | |
569 // |__________________|
570 // |******************|
571 // | cross_plane_pad |
572 // |******************|
573 // | |
574 // | plane1 |
575 // | |
576 // |__________________|
577
578 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
579 uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
580 zout = min(DEPTH_GEMM3D - 1, zout);
581
582 // Add offset due to the cross plane paddings
583 zout *= (dst_cross_plane_pad * dst_stride_y);
584
585 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
586 // multiply dst_stride_z by DEPTH_GEMM3D
587 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
588
Gian Marco05288a22017-11-21 10:57:50 +0000589 // Store the result
590 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100591 (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
Gian Marco05288a22017-11-21 10:57:50 +0000592#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
593 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100594 (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
Gian Marco05288a22017-11-21 10:57:50 +0000595#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
596#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
597 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100598 (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
Gian Marco05288a22017-11-21 10:57:50 +0000599#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
600#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
601 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100602 (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
Gian Marco05288a22017-11-21 10:57:50 +0000603#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000604#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
605 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100606 (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
Gian Marco7b4d5472018-01-10 15:56:30 +0000607#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100608
609#else // defined(REINTERPRET_OUTPUT_AS_3D)
610 // Add offset for batched GEMM
611 dst.ptr += z * dst_stride_z;
612
613 // Store the result
614 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
615 (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
616#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
617 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
618 (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
619#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
620#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
621 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
622 (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
623#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
624#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
625 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
626 (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
627#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
628#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
629 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
630 (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
631#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
632#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco7b4d5472018-01-10 15:56:30 +0000633}
634
635/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
636 *
637 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
638 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100639 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
640 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
641 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
642 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
643 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
644 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
645 *
Gian Marco7b4d5472018-01-10 15:56:30 +0000646 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
647 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
648 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
649 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
650 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
651 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
652 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
653 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
654 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
655 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
656 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
657 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
658 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
659 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
660 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
661 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
662 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
663 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100664 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
665 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
666 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
667 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
668 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco7b4d5472018-01-10 15:56:30 +0000669 */
670__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
671 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100672 IMAGE_DECLARATION(dst),
673 uint src0_stride_z,
674 uint src1_stride_z,
675 uint dst_stride_z
676#if defined(REINTERPRET_INPUT_AS_3D)
677 ,
678 uint src_cross_plane_pad
679#endif // REINTERPRET_INPUT_AS_3D
680#if defined(REINTERPRET_OUTPUT_AS_3D)
681 ,
682 uint dst_cross_plane_pad
683#endif // REINTERPRET_OUTPUT_AS_3D
684 )
Gian Marco7b4d5472018-01-10 15:56:30 +0000685{
686 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
687
688 // Compute starting address for matrix A and Matrix B
689 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
690
691 // Update address for the matrix A
692 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
693
694 // Update address for the matrix B
695 src_addr.s1 += idx;
696
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100697#if defined(REINTERPRET_INPUT_AS_3D)
698 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
699 // in order to take into account the presence of possible cross plane paddings
700 //
701 // | |
702 // | plane0 |
703 // | |
704 // |__________________|
705 // |******************|
706 // | cross_plane_pad |
707 // |******************|
708 // | |
709 // | plane1 |
710 // | |
711 // |__________________|
712
713 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
714 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
715 zin = min(DEPTH_GEMM3D - 1, zin);
716
717 // Add offset due to the cross plane paddings
718 zin *= (src_cross_plane_pad * src0_stride_y);
719
720 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
721 // multiply src0_stride_z by DEPTH_GEMM3D
722 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
723
724#else // defined(REINTERPRET_INPUT_AS_3D)
725
726 // Add offset for batched GEMM
727 src_addr.s0 += get_global_id(2) * src0_stride_z;
728
729#endif // defined(REINTERPRET_INPUT_AS_3D)
730
731#if defined(MATRIX_B_DEPTH)
732 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
733 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
734#else // defined(MATRIX_B_DEPTH)
735 src_addr.s1 += get_global_id(2) * src1_stride_z;
736#endif // defined(MATRIX_B_DEPTH)
737
Gian Marco7b4d5472018-01-10 15:56:30 +0000738 int end_row_vec_a = src_addr.s0 + COLS_A;
739
740 uint acc00 = 0;
741 uint acc01 = 0;
742 uint acc02 = 0;
743 uint acc03 = 0;
744#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
745 uint acc10 = 0;
746 uint acc11 = 0;
747 uint acc12 = 0;
748 uint acc13 = 0;
749#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
750#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
751 uint acc20 = 0;
752 uint acc21 = 0;
753 uint acc22 = 0;
754 uint acc23 = 0;
755#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
756#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
757 uint acc30 = 0;
758 uint acc31 = 0;
759 uint acc32 = 0;
760 uint acc33 = 0;
761#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
762#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
763 uint acc40 = 0;
764 uint acc41 = 0;
765 uint acc42 = 0;
766 uint acc43 = 0;
767#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
768
769 for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
770 {
771 // Load values from matrix A
772 uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
773#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
774 uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
775#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
776#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
777 uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
778#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
779#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
780 uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
781#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
782#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
783 uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
784#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
785 // Load values from matrix B
786 uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
787 uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
788 uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
789 uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
790
791 {
792 // Accumulate
793 ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
794 ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
795 ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
796 ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;
797
798 ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
799 ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
800 ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
801 ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;
802
803 ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
804 ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
805 ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
806 ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;
807
808 ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
809 ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
810 ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
811 ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;
812
813 acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
814 acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
815 acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
816 acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
817 }
818#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
819 {
820 // Accumulate
821 ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
822 ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
823 ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
824 ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;
825
826 ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
827 ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
828 ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
829 ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;
830
831 ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
832 ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
833 ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
834 ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;
835
836 ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
837 ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
838 ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
839 ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;
840
841 acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
842 acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
843 acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
844 acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
845 }
846#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
847#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
848 {
849 // Accumulate
850 ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
851 ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
852 ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
853 ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;
854
855 ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
856 ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
857 ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
858 ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;
859
860 ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
861 ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
862 ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
863 ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;
864
865 ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
866 ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
867 ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
868 ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;
869
870 acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
871 acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
872 acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
873 acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
874 }
875#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
876#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
877 {
878 // Accumulate
879 ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
880 ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
881 ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
882 ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;
883
884 ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
885 ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
886 ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
887 ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;
888
889 ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
890 ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
891 ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
892 ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;
893
894 ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
895 ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
896 ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
897 ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;
898
899 acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
900 acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
901 acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
902 acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
903 }
904#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
905#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
906 {
907 // Accumulate
908 ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
909 ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
910 ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
911 ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;
912
913 ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
914 ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
915 ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
916 ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;
917
918 ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
919 ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
920 ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
921 ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;
922
923 ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
924 ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
925 ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
926 ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;
927
928 acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
929 acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
930 acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
931 acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
932 }
933#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
934 }
935
936 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
937 {
938 // Load values from matrix A
939 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
940#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
941 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
942#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
943#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
944 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
945#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
946#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
947 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
948#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
949#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
950 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
951#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
952 // Load values from matrix B
953 uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);
954
955 // Accumulate
956 {
957 // Accumulate
958 ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
959 ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
960 ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
961 ushort tmp3 = (ushort)b0.s3 * (ushort)a0;
962
963 acc00 += ((uint)tmp0);
964 acc01 += ((uint)tmp1);
965 acc02 += ((uint)tmp2);
966 acc03 += ((uint)tmp3);
967 }
968#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
969 {
970 // Accumulate
971 ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
972 ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
973 ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
974 ushort tmp3 = (ushort)b0.s3 * (ushort)a1;
975
976 acc10 += ((uint)tmp0);
977 acc11 += ((uint)tmp1);
978 acc12 += ((uint)tmp2);
979 acc13 += ((uint)tmp3);
980 }
981#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
982#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
983 {
984 // Accumulate
985 ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
986 ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
987 ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
988 ushort tmp3 = (ushort)b0.s3 * (ushort)a2;
989
990 acc20 += ((uint)tmp0);
991 acc21 += ((uint)tmp1);
992 acc22 += ((uint)tmp2);
993 acc23 += ((uint)tmp3);
994 }
995#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
996#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
997 {
998 // Accumulate
999 ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
1000 ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
1001 ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
1002 ushort tmp3 = (ushort)b0.s3 * (ushort)a3;
1003
1004 acc30 += ((uint)tmp0);
1005 acc31 += ((uint)tmp1);
1006 acc32 += ((uint)tmp2);
1007 acc33 += ((uint)tmp3);
1008 }
1009#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1010#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1011 {
1012 // Accumulate
1013 ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
1014 ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
1015 ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
1016 ushort tmp3 = (ushort)b0.s3 * (ushort)a4;
1017
1018 acc40 += ((uint)tmp0);
1019 acc41 += ((uint)tmp1);
1020 acc42 += ((uint)tmp2);
1021 acc43 += ((uint)tmp3);
1022 }
1023#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1024 }
1025
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001026 const int z = get_global_id(2);
1027
Gian Marco7b4d5472018-01-10 15:56:30 +00001028 // Compute destination address
1029 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1030
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001031#if defined(REINTERPRET_OUTPUT_AS_3D)
1032 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
1033 // in order to take into account the presence of possible cross plane paddings
1034 //
1035 // | |
1036 // | plane0 |
1037 // | |
1038 // |__________________|
1039 // |******************|
1040 // | cross_plane_pad |
1041 // |******************|
1042 // | |
1043 // | plane1 |
1044 // | |
1045 // |__________________|
1046
1047 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
1048 uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
1049 zout = min(DEPTH_GEMM3D - 1, zout);
1050
1051 // Add offset due to the cross plane paddings
1052 zout *= (dst_cross_plane_pad * dst_stride_y);
1053
1054 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1055 // multiply dst_stride_z by DEPTH_GEMM3D
1056 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
1057
Gian Marco7b4d5472018-01-10 15:56:30 +00001058 // Store the result
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001059 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
Gian Marco7b4d5472018-01-10 15:56:30 +00001060#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001061 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
Gian Marco7b4d5472018-01-10 15:56:30 +00001062#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1063#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001064 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
Gian Marco7b4d5472018-01-10 15:56:30 +00001065#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1066#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001067 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
Gian Marco7b4d5472018-01-10 15:56:30 +00001068#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1069#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001070 vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
Gian Marco7b4d5472018-01-10 15:56:30 +00001071#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001072
1073#else // defined(REINTERPRET_OUTPUT_AS_3D)
1074 // Add offset for batched GEMM
1075 dst.ptr += z * dst_stride_z;
1076
1077 // Store the result
1078 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
1079#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1080 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
1081#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1082#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1083 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
1084#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1085#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1086 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
1087#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1088#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1089 vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
1090#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1091#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +00001092}
Giorgio Arena6200fa42018-07-06 17:06:36 +01001093
Georgios Pinitasdaa38552018-08-28 17:43:18 +01001094#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001095/** OpenCL kernel optimized to use dot product that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
1096 *
1097 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
1098 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001099 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1100 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1101 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1102 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1103 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1104 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
1105 *
Giorgio Arena6200fa42018-07-06 17:06:36 +01001106 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
1107 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1108 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1109 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1110 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1111 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1112 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
1113 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1114 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1115 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1116 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1117 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1118 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
1119 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1120 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1121 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1122 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1123 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001124 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
1125 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
1126 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1127 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
1128 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001129 */
1130__kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),
1131 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001132 IMAGE_DECLARATION(dst),
1133 uint src0_stride_z,
1134 uint src1_stride_z,
1135 uint dst_stride_z
1136#if defined(REINTERPRET_INPUT_AS_3D)
1137 ,
1138 uint src_cross_plane_pad
1139#endif // REINTERPRET_INPUT_AS_3D
1140#if defined(REINTERPRET_OUTPUT_AS_3D)
1141 ,
1142 uint dst_cross_plane_pad
1143#endif // REINTERPRET_OUTPUT_AS_3D)
1144 )
Giorgio Arena6200fa42018-07-06 17:06:36 +01001145{
1146 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
1147
1148 // Compute starting address for matrix A and Matrix B
1149 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1150
1151 // Update address for the matrix A
1152 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
1153
1154 // Update address for the matrix B
1155 src_addr.s1 += idx;
1156
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001157#if defined(REINTERPRET_INPUT_AS_3D)
1158 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
1159 // in order to take into account the presence of possible cross plane paddings
1160 //
1161 // | |
1162 // | plane0 |
1163 // | |
1164 // |__________________|
1165 // |******************|
1166 // | cross_plane_pad |
1167 // |******************|
1168 // | |
1169 // | plane1 |
1170 // | |
1171 // |__________________|
1172
1173 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
1174 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
1175 zin = min(DEPTH_GEMM3D - 1, zin);
1176
1177 // Add offset due to the cross plane paddings
1178 zin *= (src_cross_plane_pad * src0_stride_y);
1179
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001180 zin += ((uint4)(0, 1, 2, 3)) * src0_stride_y;
1181
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001182 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1183 // multiply src0_stride_z by DEPTH_GEMM3D
1184 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
1185
1186#else // defined(REINTERPRET_INPUT_AS_3D)
1187
1188 // Add offset for batched GEMM
1189 src_addr.s0 += get_global_id(2) * src0_stride_z;
1190
1191#endif // defined(REINTERPRET_INPUT_AS_3D)
1192
1193#if defined(MATRIX_B_DEPTH)
1194 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1195 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
1196#else // defined(MATRIX_B_DEPTH)
1197 src_addr.s1 += get_global_id(2) * src1_stride_z;
1198#endif // defined(MATRIX_B_DEPTH)
1199
Giorgio Arena6200fa42018-07-06 17:06:36 +01001200 uint acc00 = 0;
1201 uint acc01 = 0;
1202 uint acc02 = 0;
1203 uint acc03 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001204 uint acc04 = 0;
1205 uint acc05 = 0;
1206 uint acc06 = 0;
1207 uint acc07 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001208#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1209 uint acc10 = 0;
1210 uint acc11 = 0;
1211 uint acc12 = 0;
1212 uint acc13 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001213 uint acc14 = 0;
1214 uint acc15 = 0;
1215 uint acc16 = 0;
1216 uint acc17 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001217#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1218#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1219 uint acc20 = 0;
1220 uint acc21 = 0;
1221 uint acc22 = 0;
1222 uint acc23 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001223 uint acc24 = 0;
1224 uint acc25 = 0;
1225 uint acc26 = 0;
1226 uint acc27 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001227#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1228#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1229 uint acc30 = 0;
1230 uint acc31 = 0;
1231 uint acc32 = 0;
1232 uint acc33 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001233 uint acc34 = 0;
1234 uint acc35 = 0;
1235 uint acc36 = 0;
1236 uint acc37 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001237#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Giorgio Arena6200fa42018-07-06 17:06:36 +01001238
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001239 // A and B src indices get incremented at the same time.
1240 int i = 0;
1241 for(; i <= ((int)COLS_A - 8); i += 8)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001242 {
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001243#if defined(REINTERPRET_INPUT_AS_3D)
1244 // Load values from matrix A and matrix B
1245 uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001246#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001247 uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001248#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1249#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001250 uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001251#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1252#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001253 uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001254#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001255#else // defined(REINTERPRET_INPUT_AS_3D)
1256 // Load values from matrix A and matrix B
1257 uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1258#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1259 uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1260#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1261#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1262 uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1263#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1264#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1265 uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1266#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1267#endif // defined(REINTERPRET_INPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001268
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001269 uchar8 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
1270 uchar8 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
1271 uchar8 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
1272 uchar8 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
1273 src_addr.s1 += 4 * src1_stride_y;
1274
1275 ARM_DOT(a0.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);
1276 ARM_DOT(a0.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);
1277 ARM_DOT(a0.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);
1278 ARM_DOT(a0.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);
1279 ARM_DOT(a0.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);
1280 ARM_DOT(a0.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);
1281 ARM_DOT(a0.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);
1282 ARM_DOT(a0.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);
1283
Giorgio Arena6200fa42018-07-06 17:06:36 +01001284#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001285 ARM_DOT(a1.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);
1286 ARM_DOT(a1.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);
1287 ARM_DOT(a1.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);
1288 ARM_DOT(a1.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);
1289 ARM_DOT(a1.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);
1290 ARM_DOT(a1.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);
1291 ARM_DOT(a1.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);
1292 ARM_DOT(a1.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);
Giorgio Arena6200fa42018-07-06 17:06:36 +01001293#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1294#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001295 ARM_DOT(a2.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);
1296 ARM_DOT(a2.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);
1297 ARM_DOT(a2.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);
1298 ARM_DOT(a2.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);
1299 ARM_DOT(a2.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);
1300 ARM_DOT(a2.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);
1301 ARM_DOT(a2.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);
1302 ARM_DOT(a2.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);
Giorgio Arena6200fa42018-07-06 17:06:36 +01001303#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1304#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001305 ARM_DOT(a3.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);
1306 ARM_DOT(a3.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);
1307 ARM_DOT(a3.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);
1308 ARM_DOT(a3.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);
1309 ARM_DOT(a3.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);
1310 ARM_DOT(a3.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);
1311 ARM_DOT(a3.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);
1312 ARM_DOT(a3.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);
Giorgio Arena6200fa42018-07-06 17:06:36 +01001313#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001314
1315 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
1316 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
1317 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
1318 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
1319 src_addr.s1 += 4 * src1_stride_y;
1320
1321 ARM_DOT(a0.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);
1322 ARM_DOT(a0.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);
1323 ARM_DOT(a0.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);
1324 ARM_DOT(a0.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);
1325 ARM_DOT(a0.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);
1326 ARM_DOT(a0.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);
1327 ARM_DOT(a0.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);
1328 ARM_DOT(a0.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);
1329
1330#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1331 ARM_DOT(a1.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);
1332 ARM_DOT(a1.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);
1333 ARM_DOT(a1.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);
1334 ARM_DOT(a1.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);
1335 ARM_DOT(a1.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);
1336 ARM_DOT(a1.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);
1337 ARM_DOT(a1.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);
1338 ARM_DOT(a1.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);
1339#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1340#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1341 ARM_DOT(a2.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);
1342 ARM_DOT(a2.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);
1343 ARM_DOT(a2.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);
1344 ARM_DOT(a2.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);
1345 ARM_DOT(a2.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);
1346 ARM_DOT(a2.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);
1347 ARM_DOT(a2.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);
1348 ARM_DOT(a2.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);
1349#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1350#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1351 ARM_DOT(a3.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);
1352 ARM_DOT(a3.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);
1353 ARM_DOT(a3.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);
1354 ARM_DOT(a3.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);
1355 ARM_DOT(a3.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);
1356 ARM_DOT(a3.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);
1357 ARM_DOT(a3.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);
1358 ARM_DOT(a3.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);
1359#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1360
1361 src_addr.s0 += 8;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001362 }
1363
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001364 for(; i < (int)COLS_A; ++i)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001365 {
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001366#if defined(REINTERPRET_INPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001367 // Load values from matrix A
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001368 uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001369#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001370 uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001371#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1372#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001373 uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001374#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1375#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001376 uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001377#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001378#else // defined(REINTERPRET_INPUT_AS_3D)
1379 // Load values from matrix A
1380 uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1381#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1382 uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1383#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1384#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1385 uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1386#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1387#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1388 uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1389#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1390#endif // defined(REINTERPRET_INPUT_AS_3D)
1391
Giorgio Arena6200fa42018-07-06 17:06:36 +01001392 // Load values from matrix B
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001393 uchar8 b0 = vload8(0, src1_ptr + src_addr.s1);
1394 src_addr.s1 += src1_stride_y;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001395
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001396 acc00 += (uint)a0 * b0.s0;
1397 acc01 += (uint)a0 * b0.s1;
1398 acc02 += (uint)a0 * b0.s2;
1399 acc03 += (uint)a0 * b0.s3;
1400 acc04 += (uint)a0 * b0.s4;
1401 acc05 += (uint)a0 * b0.s5;
1402 acc06 += (uint)a0 * b0.s6;
1403 acc07 += (uint)a0 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001404
Giorgio Arena6200fa42018-07-06 17:06:36 +01001405#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001406 acc10 += (uint)a1 * b0.s0;
1407 acc11 += (uint)a1 * b0.s1;
1408 acc12 += (uint)a1 * b0.s2;
1409 acc13 += (uint)a1 * b0.s3;
1410 acc14 += (uint)a1 * b0.s4;
1411 acc15 += (uint)a1 * b0.s5;
1412 acc16 += (uint)a1 * b0.s6;
1413 acc17 += (uint)a1 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001414#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1415#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001416 acc20 += (uint)a2 * b0.s0;
1417 acc21 += (uint)a2 * b0.s1;
1418 acc22 += (uint)a2 * b0.s2;
1419 acc23 += (uint)a2 * b0.s3;
1420 acc24 += (uint)a2 * b0.s4;
1421 acc25 += (uint)a2 * b0.s5;
1422 acc26 += (uint)a2 * b0.s6;
1423 acc27 += (uint)a2 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001424#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1425#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001426 acc30 += (uint)a3 * b0.s0;
1427 acc31 += (uint)a3 * b0.s1;
1428 acc32 += (uint)a3 * b0.s2;
1429 acc33 += (uint)a3 * b0.s3;
1430 acc34 += (uint)a3 * b0.s4;
1431 acc35 += (uint)a3 * b0.s5;
1432 acc36 += (uint)a3 * b0.s6;
1433 acc37 += (uint)a3 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001434#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Giorgio Arena6200fa42018-07-06 17:06:36 +01001435
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001436 src_addr.s0 += 1;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001437 }
1438
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001439 int z = get_global_id(2);
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001440
Giorgio Arena6200fa42018-07-06 17:06:36 +01001441 // Compute destination address
1442 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1443
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001444 // Compute dst address
1445 __global uchar *dst_addr = dst.ptr;
1446
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001447#if defined(REINTERPRET_OUTPUT_AS_3D)
1448 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
1449 // in order to take into account the presence of possible cross plane paddings
1450 //
1451 // | |
1452 // | plane0 |
1453 // | |
1454 // |__________________|
1455 // |******************|
1456 // | cross_plane_pad |
1457 // |******************|
1458 // | |
1459 // | plane1 |
1460 // | |
1461 // |__________________|
1462
1463 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001464 uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001465 zout = min(DEPTH_GEMM3D - 1, zout);
1466
1467 // Add offset due to the cross plane paddings
1468 zout *= (dst_cross_plane_pad * dst_stride_y);
1469
1470 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1471 // multiply dst_stride_z by DEPTH_GEMM3D
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001472 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001473
Giorgio Arena6200fa42018-07-06 17:06:36 +01001474 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001475 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));
1476 vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001477#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001478 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));
1479 vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001480#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1481#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001482 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));
1483 vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001484#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1485#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001486 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));
1487 vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001488#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001489
1490#else // defined(REINTERPRET_OUTPUT_AS_3D)
1491 // Add offset for batched GEMM
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001492 dst_addr += z * dst_stride_z;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001493
1494 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001495 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y));
1496 vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001497#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001498 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y));
1499 vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001500#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1501#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001502 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y));
1503 vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001504#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1505#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001506 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y));
1507 vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001508#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001509#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1510}
Georgios Pinitasdaa38552018-08-28 17:43:18 +01001511#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Gian Marco05288a22017-11-21 10:57:50 +00001512#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
1513
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001514#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001515/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM data type.
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001516 * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
1517 * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
1518 *
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001519 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
1520 * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001521 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
1522 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
1523 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
1524 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
1525 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
1526 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1527 * - M0 = 2, 3, 4, 5, 6, 7, 8
1528 * - N0 = 2, 3, 4, 8, 16
1529 * - K0 = 2, 3, 4, 8, 16
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001530 * - V0 >= 1
1531 * - H0 >= 1
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001532 *
1533 * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
1534 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1535 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1536 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1537 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
1538 *
1539 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8
1540 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
1541 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1542 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
1543 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1544 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
1545 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1546 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
1547 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1548 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
1549 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1550 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
1551 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1552 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1553 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1554 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1555 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1556 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1557 * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
1558 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
1559 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
1560 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1561 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
1562 */
1563__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
1564 IMAGE_DECLARATION(rhs),
1565 IMAGE_DECLARATION(dst),
1566 uint k,
1567 uint lhs_stride_z,
1568 uint rhs_stride_z,
1569 uint dst_stride_z
1570#if defined(REINTERPRET_OUTPUT_AS_3D)
1571 ,
1572 uint dst_cross_plane_pad
1573#endif // REINTERPRET_OUTPUT_AS_3D
1574 )
1575{
1576 // Block size
1577#define LHS_BLOCK_SIZE ((K0) * (M0))
1578
1579#if defined(LHS_INTERLEAVE)
1580#define LHS_OFFSET_X (K0)
1581#define LHS_STEP_X ((K0) * (V0))
1582#define LHS_STEP_LOOP (1)
1583#else // defined(INTERLEAVE)
1584#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
1585#define LHS_STEP_X (K0)
1586#define LHS_STEP_LOOP (V0)
1587#endif // defined(INTERLEAVE)
1588
1589 // Block size
1590#define RHS_BLOCK_SIZE ((K0) * (N0))
1591
1592 // RHS offset and step X
1593#if defined(RHS_INTERLEAVE)
1594#define RHS_OFFSET_X (K0)
1595#define RHS_STEP_X ((K0) * (H0))
1596#define RHS_STEP_LOOP (1)
1597#else // defined(RHS_INTERLEAVE)
1598#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1599#define RHS_STEP_X (K0)
1600#define RHS_STEP_LOOP (H0)
1601#endif // defined(RHS_INTERLEAVE)
1602
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001603 uint x = get_global_id(0);
1604 uint y = get_global_id(1);
1605 uint z = get_global_id(2);
1606
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001607#if defined(DUMMY_WORK_ITEMS)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001608 if((x * N0 >= N) || (y * M0 >= M))
Gian Marco Iodiceb0c50372019-03-15 10:13:05 +00001609 {
1610 return;
1611 }
1612#endif // defined(DUMMY_WORK_ITEMS)
1613
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001614 // Compute LHS matrix address
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001615 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001616
1617 // Compute RHS matrix address
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001618 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001619
1620#if defined(MATRIX_B_DEPTH)
1621 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001622 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001623#else // defined(MATRIX_B_DEPTH)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001624 rhs_addr += z * rhs_stride_z;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001625#endif // defined(MATRIX_B_DEPTH)
1626
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001627 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1628 REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
1629
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001630 // Initialize the accumulators
1631 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
1632
1633 for(int i = 0; i < k; i += K0)
1634 {
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001635 // Load values from LHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001636 LOAD_BLOCK(M0, K0, uchar, a, lhs_addr, 0, LHS_STEP_X, zlhs);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001637
1638 // Load values from RHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001639 LOAD_BLOCK(N0, K0, uchar, b, rhs_addr, 0, RHS_STEP_X, zrhs);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001640
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001641 // Partial matrix multiplication M0,N0,K0
1642 ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001643
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001644 // Update address
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001645 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
1646 rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
1647 }
1648
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001649 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001650
1651 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1652
1653#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001654 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
1655 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001656
1657 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1658 // multiply dst_stride_z by DEPTH_GEMM3D
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001659 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001660
1661#else // defined(REINTERPRET_OUTPUT_AS_3D)
1662
1663 // Add offset for batched GEMM
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001664 dst_addr += z * dst_stride_z;
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001665
1666#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1667
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001668 // Convert and store output block
1669 CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001670
1671#undef LHS_BLOCK_SIZE
1672#undef LHS_OFFSET_X
1673#undef LHS_STEP_X
1674#undef RHS_BLOCK_SIZE
1675#undef RHS_OFFSET_X
1676#undef RHS_STEP_X
1677}
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +00001678#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K)
1679
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001680#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
1681
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001682/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1683 * The LHS matrix is NOT reshaped
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001684 * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001685 *
1686 * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
1687 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
1688 * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
1689 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
1690 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
1691 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1692 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1693 * - N0 = 2, 3, 4, 8, 16
1694 * - K0 = 2, 3, 4, 8, 16
1695 * - H0 >= 1
1696 *
1697 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1698 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1699 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1700 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1701 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1702 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1703 *
1704 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
1705 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
1706 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1707 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
1708 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1709 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
1710 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1711 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
1712 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1713 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
1714 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1715 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
1716 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1717 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1718 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1719 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1720 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1721 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1722 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
1723 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
1724 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1725 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1726 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
1727 */
1728__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
1729 IMAGE_DECLARATION(rhs),
1730 IMAGE_DECLARATION(dst),
1731 uint lhs_stride_z,
1732 uint rhs_stride_z,
1733 uint dst_stride_z
1734#if defined(REINTERPRET_INPUT_AS_3D)
1735 ,
1736 uint lhs_cross_plane_pad
1737#endif // REINTERPRET_INPUT_AS_3D
1738#if defined(REINTERPRET_OUTPUT_AS_3D)
1739 ,
1740 uint dst_cross_plane_pad
1741#endif // REINTERPRET_OUTPUT_AS_3D
1742 )
1743{
1744 // Block size
1745#define RHS_BLOCK_SIZE ((K0) * (N0))
1746
1747 // RHS offset and step X
1748#if defined(RHS_INTERLEAVE)
1749#define RHS_OFFSET_X (K0)
1750#define RHS_STEP_X ((K0) * (H0))
1751#define RHS_STEP_LOOP (1)
1752#else // defined(RHS_INTERLEAVE)
1753#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
1754#define RHS_STEP_X (K0)
1755#define RHS_STEP_LOOP (H0)
1756#endif // defined(RHS_INTERLEAVE)
1757
1758 uint x = get_global_id(0);
1759 uint y = get_global_id(1);
1760 uint z = get_global_id(2);
1761
Gian Marco Iodice86cfffe2019-04-02 11:02:20 +01001762#if defined(DUMMY_WORK_ITEMS)
1763 if((x * N0 >= N) || (y * M0 >= M))
1764 {
1765 return;
1766 }
1767#endif // defined(DUMMY_WORK_ITEMS)
1768
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001769 // Compute LHS matrix address
1770 uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
1771
1772 // Compute RHS matrix address
1773 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
1774
1775#if defined(MATRIX_B_DEPTH)
1776 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1777 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1778#else // defined(MATRIX_B_DEPTH)
1779 rhs_offset += z * rhs_stride_z;
1780#endif // defined(MATRIX_B_DEPTH)
1781
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001782 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1783 REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001784
1785#if defined(REINTERPRET_INPUT_AS_3D)
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001786 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
1787 CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001788
1789 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1790 // multiply lhs_stride_z by DEPTH_GEMM3D
1791 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1792
1793#else // defined(REINTERPRET_INPUT_AS_3D)
1794
1795 // Add offset for batched GEMM
1796 lhs_offset += z * lhs_stride_z;
1797
1798#endif // defined(REINTERPRET_INPUT_AS_3D)
1799
1800 // Initialize the accumulators
1801 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;
1802
1803 for(int i = 0; i < K; i += K0)
1804 {
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001805 // Load values from LHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001806 LOAD_BLOCK(M0, K0, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001807
1808 // Load values from RHS matrix
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001809 LOAD_BLOCK(N0, K0, uchar, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001810
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001811 // Partial matrix multiplication M0,N0,K0
1812 ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001813
1814 lhs_offset += K0;
1815 rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
1816 }
1817
1818 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
1819
1820 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1821
1822#if defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001823 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001824 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001825
1826 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1827 // multiply dst_stride_z by DEPTH_GEMM3D
1828 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1829
1830#else // defined(REINTERPRET_OUTPUT_AS_3D)
1831
1832 // Add offset for batched GEMM
1833 dst_addr += z * dst_stride_z;
1834
1835#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1836
Gian Marco Iodice43a129e2019-05-14 10:14:08 +01001837 // Convert and store output block
1838 CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
Gian Marco Iodice62251f72019-03-11 16:07:12 +00001839
1840#undef RHS_BLOCK_SIZE
1841#undef RHS_OFFSET_X
1842#undef RHS_STEP_X
1843}
1844#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(K)
1845
Gian Marco Iodicee7510622019-06-03 17:28:17 +01001846#if defined(M0) && defined(N0) && defined(K0) && defined(K)
1847
1848/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
1849 * The LHS matrix is NOT reshaped
1850 * The RHS matrix is NOT reshaped
1851 *
1852 * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
1853 * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
1854 * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)
1855 * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., -DK0=2)
1856 * @note Only the following configurations of M0, N0 and K0 are currently supported:
1857 * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
1858 * - N0 = 2, 3, 4, 8, 16
1859 * - K0 = 2, 3, 4, 8, 16
1860 *
1861 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1862 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1863 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1864 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1865 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1866 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
1867 *
1868 * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
1869 * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
1870 * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1871 * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
1872 * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1873 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
1874 * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
1875 * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
1876 * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1877 * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
1878 * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1879 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
1880 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr
1881 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1882 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
1883 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1884 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
1885 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
1886 * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)
1887 * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)
1888 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1889 * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
1890 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
1891 */
1892__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
1893 IMAGE_DECLARATION(rhs),
1894 IMAGE_DECLARATION(dst),
1895 uint lhs_stride_z,
1896 uint rhs_stride_z,
1897 uint dst_stride_z
1898#if defined(REINTERPRET_INPUT_AS_3D)
1899 ,
1900 uint lhs_cross_plane_pad
1901#endif // REINTERPRET_INPUT_AS_3D
1902#if defined(REINTERPRET_OUTPUT_AS_3D)
1903 ,
1904 uint dst_cross_plane_pad
1905#endif // REINTERPRET_OUTPUT_AS_3D
1906 )
1907{
1908 uint x = get_global_id(0);
1909 uint y = get_global_id(1);
1910 uint z = get_global_id(2);
1911
1912#if defined(DUMMY_WORK_ITEMS)
1913 if((x * N0 >= N) || (y * M0 >= M))
1914 {
1915 return;
1916 }
1917#endif // defined(DUMMY_WORK_ITEMS)
1918
1919 // Compute LHS matrix address
1920 uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
1921
1922 // Compute RHS matrix address
1923 uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0;
1924
1925#if defined(MATRIX_B_DEPTH)
1926 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1927 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1928#else // defined(MATRIX_B_DEPTH)
1929 rhs_offset += z * rhs_stride_z;
1930#endif // defined(MATRIX_B_DEPTH)
1931
1932 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
1933 REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
1934
1935#if defined(REINTERPRET_INPUT_AS_3D)
1936 // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
1937 CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
1938
1939 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1940 // multiply lhs_stride_z by DEPTH_GEMM3D
1941 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1942
1943#else // defined(REINTERPRET_INPUT_AS_3D)
1944
1945 // Add offset for batched GEMM
1946 lhs_offset += z * lhs_stride_z;
1947
1948#endif // defined(REINTERPRET_INPUT_AS_3D)
1949
1950 // Initialize the accumulators
1951 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
1952
1953 int i = 0;
1954
1955 for(; i <= (K - K0); i += K0)
1956 {
1957 // Load values from LHS matrix
1958 LOAD_BLOCK(M0, K0, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
1959
1960 // Load values from RHS matrix
1961 LOAD_BLOCK(K0, N0, uchar, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
1962
1963 // Transpose the values from RHS matrix
1964 TRANSPOSE_K0XN0(K0, N0, b_t, b);
1965
1966 // Partial matrix multiplication M0,N0,K0
1967 ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
1968
1969 // Update the offset
1970 lhs_offset += K0;
1971 rhs_offset += K0 * rhs_stride_y;
1972 }
1973
1974 // Left-over for loop
1975 for(; i < K; ++i)
1976 {
1977 // Load values from LHS matrix
1978 LOAD_BLOCK(M0, 1, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
1979
1980 // Load values from RHS matrix
1981 LOAD_BLOCK(1, N0, uchar, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
1982
1983 // Transpose the values from RHS matrix
1984 TRANSPOSE_K0XN0(1, N0, b_t, b);
1985
1986 // Partial matrix multiplication M0,N0,1
1987 ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
1988
1989 // Update the offset
1990 lhs_offset += 1;
1991 rhs_offset += rhs_stride_y;
1992 }
1993
1994 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
1995
1996 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
1997
1998#if defined(REINTERPRET_OUTPUT_AS_3D)
1999 // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
2000 CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
2001
2002 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
2003 // multiply dst_stride_z by DEPTH_GEMM3D
2004 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
2005
2006#else // defined(REINTERPRET_OUTPUT_AS_3D)
2007
2008 // Add offset for batched GEMM
2009 dst_addr += z * dst_stride_z;
2010
2011#endif // defined(REINTERPRET_OUTPUT_AS_3D)
2012
2013 // Convert and store output block
2014 CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
2015}
2016#endif // defined(M0) && defined(N0) && defined(K0) && defined(K)
2017
Gian Marco05288a22017-11-21 10:57:50 +00002018#if defined(COLS_A)
2019/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
2020 *
2021 * @note This stage is needed to handle the offset of matrix product
2022 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
2023 *
2024 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
2025 *
2026 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
2027 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2028 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2029 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2030 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2031 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2032 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2033 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2034 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
2035 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2036 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2037 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2038 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2039 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2040 */
2041__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
2042 IMAGE_DECLARATION(dst))
2043{
2044 // Compute source and destination addresses
2045 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
2046 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
2047
2048 uint4 sum_row_u32 = (uint4)0;
2049 uint sum_row = 0;
2050
2051 __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
2052
2053 int i = 0;
2054
2055 // This for loop performs 16 accumulations
2056 for(; i <= ((int)COLS_A - 16); i += 16)
2057 {
2058 const uchar16 a0_u8 = vload16(0, matrix_a + i);
2059
2060 sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
2061 }
2062
2063 // This for loop performs the leftover accumulations
2064 for(; i < COLS_A; ++i)
2065 {
2066 sum_row += matrix_a[i];
2067 }
2068
2069 sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;
2070
2071 *((__global int *)dst.ptr) = (int)sum_row;
2072}
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002073
2074#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
2075/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction
2076 *
2077 * @note This stage is needed to handle the offset of matrix product
2078 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
2079 *
2080 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
2081 *
2082 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
2083 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2084 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2085 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2086 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2087 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2088 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2089 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2090 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
2091 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2092 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2093 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2094 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2095 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2096 */
2097__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
2098 IMAGE_DECLARATION(dst))
2099{
2100 // Compute source and destination addresses
2101 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
2102 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
2103
2104 uint sum_row = 0;
2105
2106 __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
2107
2108 int i = 0;
2109
2110 // This for loop performs 16 accumulations
2111 for(; i <= ((int)COLS_A - 32); i += 32)
2112 {
2113 uchar16 a0_u8 = vload16(0, matrix_a + i);
2114
2115 sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
2116 sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
2117 sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
2118 sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
2119
2120 a0_u8 = vload16(1, matrix_a + i);
2121
2122 sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
2123 sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
2124 sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
2125 sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
2126 }
2127
2128 // This for loop performs the leftover accumulations
2129 for(; i < COLS_A; ++i)
2130 {
2131 sum_row += matrix_a[i];
2132 }
2133
2134 *((__global int *)dst.ptr) = (int)sum_row;
2135}
2136#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Gian Marco05288a22017-11-21 10:57:50 +00002137#endif // defined(COLS_A)
2138
2139#if defined(COLS_B) && defined(ROWS_B)
2140/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
2141 *
2142 * @note This stage is needed to handle the offset of matrix product
2143 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
2144 *
2145 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
2146 *
2147 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
2148 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2149 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2150 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2151 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2152 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2153 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2154 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2155 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
2156 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2157 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2158 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2159 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2160 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2161 */
2162__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
2163 IMAGE_DECLARATION(dst))
2164{
2165 // Compute source and destination addresses
2166 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
2167 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
2168
2169 uint16 sum_col_u32 = (uint16)0;
2170
2171 __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);
2172
2173 int i = 0;
2174 // This for loop performs 4 accumulations
2175 for(; i <= ((int)ROWS_B - 4); i += 4)
2176 {
2177 const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
2178 const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
2179 const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
2180 const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);
2181
2182 sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);
2183
2184 matrix_b += 4 * src_stride_y;
2185 }
2186
2187 // This for loop perfoms the leftover accumulations
2188 for(; i < (int)ROWS_B; ++i)
2189 {
2190 const uchar16 b0_u8 = vload16(0, matrix_b);
2191
2192 sum_col_u32 += convert_uint16(b0_u8);
2193
2194 matrix_b += src_stride_y;
2195 }
2196
2197 vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
2198}
2199#endif // defined(COLS_B) && defined(ROWS_B)
2200
2201#if defined(K_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002202
2203/* Helper function used to calculate the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel.
2204 *
2205 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),
2206 * and calculates the offset contribution of matrix A and matrix B.
2207 *
2208 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2209 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2210 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
2211 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
2212 *
2213 * @param[in] x get_global_id(0) * 4
2214 * @param[in] y get_global_id(1)
2215 * @param[in] z get_global_id(2)
2216 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2217 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2218 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2219 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2220 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2221 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2222 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2223 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2224 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2225 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2226 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2227 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2228 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2229 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2230 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2231 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
2232 */
2233inline int4 offset_contribution(
2234 int x,
2235 int y,
2236 int z
2237#if defined(A_OFFSET)
2238 ,
2239 IMAGE_DECLARATION(sum_col)
2240#endif // defined(A_OFFSET)
2241#if defined(B_OFFSET)
2242 ,
2243 IMAGE_DECLARATION(sum_row)
2244#endif // defined(B_OFFSET)
2245#if defined(ADD_BIAS)
2246 ,
2247 VECTOR_DECLARATION(biases)
2248#endif // defined(ADD_BIAS)
2249)
2250{
2251 int4 a_offset_s32 = (int4)0;
2252 int4 b_offset_s32 = (int4)0;
2253
2254 int batch_id = z;
2255#if defined(DEPTH_INPUT3D)
2256 batch_id /= (int)DEPTH_INPUT3D;
2257#endif // defined(DEPTH_INPUT3D)
2258
2259#if defined(A_OFFSET)
2260 // Compute the offset contribution due to A_OFFSET
2261 __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
2262
2263 // Compute the offset contribution due to A_OFFSET
2264#if defined(SUM_COL_HAS_BATCHES)
2265 a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
2266#else // defined(SUM_COL_HAS_BATCHES)
2267 a_offset_s32 = vload4(0, (__global int *)sum_col_addr);
2268#endif // defined(SUM_COL_HAS_BATCHES)
2269
2270 a_offset_s32 *= (int4)A_OFFSET;
2271#endif // defined(A_OFFSET)
2272
2273#if defined(B_OFFSET)
2274 // Compute the offset contribution due to A_OFFSET
2275 __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
2276
2277 // Compute the offset contribution due to B_OFFSET
2278#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
2279 b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
2280#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
2281 b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
2282#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
2283 b_offset_s32 *= (int4)B_OFFSET;
2284#endif // defined(B_OFFSET)
2285
2286#if defined(ADD_BIAS)
2287 // Add bias
2288 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2289
2290 int4 biases_values = vload4(0, (__global int *)bias_addr);
2291 b_offset_s32 += (int4)biases_values;
2292#endif // defined(ADD_BIAS)
2293
2294 return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
2295}
2296
Gian Marco05288a22017-11-21 10:57:50 +00002297/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
2298 *
2299 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),
2300 * and adds to it the offset contribution of matrix A and matrix B in-place.
2301 *
2302 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2303 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2304 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
Chunosov5124be52017-11-22 20:42:13 +07002305 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
Gian Marco05288a22017-11-21 10:57:50 +00002306 *
2307 * The final result is:
2308 *
2309 * mm_result[i][k] = mm_result[i][k] +
2310 * (sum_col[k] * A_OFFSET) +
2311 * (sum_row[i] * B_OFFSET) +
2312 * (K_OFFSET)
2313 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01002314 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
2315 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
2316 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
2317 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
2318 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
2319 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
2320 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
2321 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002322 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2323 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2324 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2325 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2326 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2327 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2328 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2329 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2330 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2331 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2332 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2333 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2334 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2335 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2336 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2337 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco05288a22017-11-21 10:57:50 +00002338 */
2339__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
2340#if defined(A_OFFSET)
2341 ,
2342 IMAGE_DECLARATION(sum_col)
2343#endif // defined(A_OFFSET)
2344#if defined(B_OFFSET)
2345 ,
2346 IMAGE_DECLARATION(sum_row)
2347#endif // defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002348#if defined(ADD_BIAS)
2349 ,
2350 VECTOR_DECLARATION(biases)
2351#endif // defined(ADD_BIAS))
Gian Marco05288a22017-11-21 10:57:50 +00002352 )
2353{
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002354 const int x = get_global_id(0) * 4;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01002355 const int y = get_global_id(1);
2356 const int z = get_global_id(2);
2357
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002358 // Compute offset contribution
2359 int4 offset_term_s32 = offset_contribution(
2360 x, y, z
Gian Marco05288a22017-11-21 10:57:50 +00002361#if defined(A_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002362 ,
2363 sum_col_ptr,
2364 sum_col_stride_x,
2365 sum_col_step_x,
2366 sum_col_stride_y,
2367 sum_col_step_y,
2368 sum_col_offset_first_element_in_bytes
Gian Marco05288a22017-11-21 10:57:50 +00002369#endif // defined(A_OFFSET)
Gian Marco05288a22017-11-21 10:57:50 +00002370#if defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002371 ,
2372 sum_row_ptr,
2373 sum_row_stride_x,
2374 sum_row_step_x,
2375 sum_row_stride_y,
2376 sum_row_step_y,
2377 sum_row_offset_first_element_in_bytes
Gian Marco05288a22017-11-21 10:57:50 +00002378#endif // defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002379#if defined(ADD_BIAS)
2380 ,
2381 biases_ptr,
2382 biases_stride_x,
2383 biases_step_x,
2384 biases_offset_first_element_in_bytes
2385#endif // defined(ADD_BIAS)
2386 );
Gian Marco05288a22017-11-21 10:57:50 +00002387
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002388 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
Gian Marco05288a22017-11-21 10:57:50 +00002389
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002390 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
Gian Marco05288a22017-11-21 10:57:50 +00002391
2392 // Add the offset terms to GEMM's result
2393 in_s32 += offset_term_s32;
2394
2395 // Store the result with the offset contribution
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002396 vstore4(in_s32, 0, (__global int *)mm_result_addr);
Gian Marco05288a22017-11-21 10:57:50 +00002397}
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002398
2399#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
2400/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.
2401 *
2402 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
2403 *
2404 *
2405 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2406 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2407 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
2408 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
2409 *
2410 * The result before the output stage is:
2411 *
2412 * mm_result[i][k] = mm_result[i][k] +
2413 * (sum_col[k] * A_OFFSET) +
2414 * (sum_row[i] * B_OFFSET) +
2415 * (K_OFFSET)
2416 *
2417 * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
2418 *
2419 * -# Add offset terms to final result
2420 * -# Multiply each entry of result by result_mult_int
2421 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
2422 * -# Shift the int32 accumulator by result_shift
2423 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
2424 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2425 *
2426 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
2427 *
2428 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2429 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2430 * These values can be used to implement "rectified linear unit" activation functions
2431 *
2432 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
2433 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
2434 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
2435 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
2436 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
2437 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
2438 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
2439 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
2440 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2441 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2442 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2443 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2444 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2445 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2446 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2447 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2448 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2449 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2450 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2451 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2452 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2453 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2454 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2455 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
2456 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2457 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2458 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2459 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2460 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2461 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2462 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2463 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2464 */
2465__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
2466#if defined(A_OFFSET)
2467 ,
2468 IMAGE_DECLARATION(sum_col)
2469#endif // defined(A_OFFSET)
2470#if defined(B_OFFSET)
2471 ,
2472 IMAGE_DECLARATION(sum_row)
2473#endif // defined(B_OFFSET)
2474 ,
2475#if defined(ADD_BIAS)
2476 VECTOR_DECLARATION(biases),
2477#endif // defined(ADD_BIAS)
2478 TENSOR3D_DECLARATION(dst))
2479{
2480 const int x = get_global_id(0) * 4;
2481 const int y = get_global_id(1);
2482 const int z = get_global_id(2);
2483
2484 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2485
2486 // Compute offset contribution
2487 int4 offset_term_s32 = offset_contribution(
2488 x, y, z
2489#if defined(A_OFFSET)
2490 ,
2491 sum_col_ptr,
2492 sum_col_stride_x,
2493 sum_col_step_x,
2494 sum_col_stride_y,
2495 sum_col_step_y,
2496 sum_col_offset_first_element_in_bytes
2497#endif // defined(A_OFFSET)
2498#if defined(B_OFFSET)
2499 ,
2500 sum_row_ptr,
2501 sum_row_stride_x,
2502 sum_row_step_x,
2503 sum_row_stride_y,
2504 sum_row_step_y,
2505 sum_row_offset_first_element_in_bytes
2506#endif // defined(B_OFFSET)
2507#if defined(ADD_BIAS)
2508 ,
2509 biases_ptr,
2510 biases_stride_x,
2511 biases_step_x,
2512 biases_offset_first_element_in_bytes
2513#endif // defined(ADD_BIAS)
2514 );
2515
2516 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
2517
2518 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
2519
2520 // Add the offset terms to GEMM's result
2521 in_s32 += offset_term_s32;
2522
2523 // -------------- OUTPUT STAGE
2524
2525 // Add the offset terms to GEMM's result
2526 in_s32 += (int4)RESULT_OFFSET;
2527
2528 // Multiply by result_mult_int and shift
2529 in_s32 *= RESULT_MULTIPLIER;
2530
2531 in_s32 >>= RESULT_SHIFT;
2532
2533 uchar4 res = convert_uchar4_sat(in_s32);
2534
2535#if defined(MIN_BOUND)
2536 res = max(res, (uchar4)MIN_BOUND);
2537#endif // defined(MIN_BOUND)
2538#if defined(MAX_BOUND)
2539 res = min(res, (uchar4)MAX_BOUND);
2540#endif // defined(MAX_BOUND)
2541
2542 // Store the result
2543 vstore4(res, 0, dst_addr);
2544}
2545
2546/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.
2547 *
2548 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
2549 *
2550 *
2551 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2552 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2553 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
2554 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
2555 *
2556 * The result before the output stage is:
2557 *
2558 * mm_result[i][k] = mm_result[i][k] +
2559 * (sum_col[k] * A_OFFSET) +
2560 * (sum_row[i] * B_OFFSET) +
2561 * (K_OFFSET)
2562 *
2563 * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
2564 *
2565 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
2566 * -# Add bias to final result if bias tensor is not a nullptr
2567 * -# Round to nearest division by a power-of-two using result_shift
2568 * -# Add offset to each result
2569 * -# Clamp the value between the specified min and max bounds
2570 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2571 *
2572 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
2573 *
2574 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2575 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2576 * These values can be used to implement "rectified linear unit" activation functions
2577 *
2578 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
2579 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
2580 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
2581 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
2582 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
2583 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
2584 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
2585 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
2586 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2587 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2588 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2589 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2590 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2591 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2592 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2593 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2594 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2595 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2596 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2597 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2598 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2599 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2600 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2601 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
2602 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2603 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2604 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2605 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2606 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2607 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2608 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2609 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2610 */
2611__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
2612#if defined(A_OFFSET)
2613 ,
2614 IMAGE_DECLARATION(sum_col)
2615#endif // defined(A_OFFSET)
2616#if defined(B_OFFSET)
2617 ,
2618 IMAGE_DECLARATION(sum_row)
2619#endif // defined(B_OFFSET)
2620 ,
2621#if defined(ADD_BIAS)
2622 VECTOR_DECLARATION(biases),
2623#endif // defined(ADD_BIAS)
2624 TENSOR3D_DECLARATION(dst))
2625{
2626 const int x = get_global_id(0) * 4;
2627 const int y = get_global_id(1);
2628 const int z = get_global_id(2);
2629
2630 // Compute offset contribution
2631 int4 offset_term_s32 = offset_contribution(
2632 x, y, z
2633#if defined(A_OFFSET)
2634 ,
2635 sum_col_ptr,
2636 sum_col_stride_x,
2637 sum_col_step_x,
2638 sum_col_stride_y,
2639 sum_col_step_y,
2640 sum_col_offset_first_element_in_bytes
2641#endif // defined(A_OFFSET)
2642#if defined(B_OFFSET)
2643 ,
2644 sum_row_ptr,
2645 sum_row_stride_x,
2646 sum_row_step_x,
2647 sum_row_stride_y,
2648 sum_row_step_y,
2649 sum_row_offset_first_element_in_bytes
2650#endif // defined(B_OFFSET)
2651#if defined(ADD_BIAS)
2652 ,
2653 biases_ptr,
2654 biases_stride_x,
2655 biases_step_x,
2656 biases_offset_first_element_in_bytes
2657#endif // defined(ADD_BIAS)
2658 );
2659
2660 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
2661
2662 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2663
2664 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
2665
2666 // Add the offset terms to GEMM's result
2667 in_s32 += offset_term_s32;
2668
2669 // -------------- OUTPUT STAGE
2670
2671 // Multiply by result_mult_int and shift
2672 in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
2673
2674 // Add the offset terms to GEMM's result
2675 in_s32 += (int4)RESULT_OFFSET;
2676
2677 uchar4 res = convert_uchar4_sat(in_s32);
2678
2679#if defined(MIN_BOUND)
2680 res = max(res, (uchar4)MIN_BOUND);
2681#endif // defined(MIN_BOUND)
2682#if defined(MAX_BOUND)
2683 res = min(res, (uchar4)MAX_BOUND);
2684#endif // defined(MAX_BOUND)
2685
2686 // Store the result
2687 vstore4(res, 0, dst_addr);
2688}
2689#endif // defined(K_OFFSET) && defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
Gian Marco05288a22017-11-21 10:57:50 +00002690#endif // defined(K_OFFSET)
2691
2692#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
2693/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
2694 *
2695 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
2696 * The following computations will be performed by the kernel:
2697 *
2698 * -# Add offset terms to final result
2699 * -# Multiply each entry of result by result_mult_int
2700 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
2701 * -# Shift the int32 accumulator by result_shift
2702 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
2703 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2704 *
2705 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
2706 *
2707 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2708 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2709 * These values can be used to implement "rectified linear unit" activation functions
2710 *
2711 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
2712 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2713 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2714 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2715 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2716 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2717 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2718 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002719 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2720 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2721 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2722 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco05288a22017-11-21 10:57:50 +00002723 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2724 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2725 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2726 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2727 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2728 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2729 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2730 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2731 */
2732__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
2733#if defined(ADD_BIAS)
2734 VECTOR_DECLARATION(biases),
2735#endif // defined(ADD_BIAS)
2736 TENSOR3D_DECLARATION(dst))
2737{
2738 // Compute source and destination addresses
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002739 int x = get_global_id(0) * 4;
2740 int y = get_global_id(1);
2741 int z = get_global_id(2);
Gian Marco05288a22017-11-21 10:57:50 +00002742
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002743 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Gian Marco05288a22017-11-21 10:57:50 +00002744
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002745 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2746
2747 int4 input_values = vload4(0, (__global int *)src_addr);
Gian Marco58c57942017-11-28 09:10:03 +00002748
Gian Marco05288a22017-11-21 10:57:50 +00002749#if defined(ADD_BIAS)
2750 // Add bias
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002751 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2752
2753 int4 biases_values = vload4(0, (__global int *)bias_addr);
2754 input_values += (int4)biases_values;
Gian Marco05288a22017-11-21 10:57:50 +00002755#endif // defined(ADD_BIAS)
2756
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002757 // Add the offset terms to GEMM's result
2758 input_values += (int4)RESULT_OFFSET;
2759
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +00002760 // Multiply by result_mult_int and shift
Gian Marco58c57942017-11-28 09:10:03 +00002761 input_values *= RESULT_MULT_INT;
Gian Marco05288a22017-11-21 10:57:50 +00002762
Gian Marco58c57942017-11-28 09:10:03 +00002763 input_values >>= RESULT_SHIFT;
Gian Marco05288a22017-11-21 10:57:50 +00002764
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002765 uchar4 res = convert_uchar4_sat(input_values);
Gian Marco05288a22017-11-21 10:57:50 +00002766
2767#if defined(MIN_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002768 res = max(res, (uchar4)MIN_BOUND);
Gian Marco05288a22017-11-21 10:57:50 +00002769#endif // defined(MIN_BOUND)
2770#if defined(MAX_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002771 res = min(res, (uchar4)MAX_BOUND);
Gian Marco05288a22017-11-21 10:57:50 +00002772#endif // defined(MAX_BOUND)
2773
2774 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002775 vstore4(res, 0, dst_addr);
Gian Marco05288a22017-11-21 10:57:50 +00002776}
Gian Marco58c57942017-11-28 09:10:03 +00002777#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
2778
2779#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
2780/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
2781 *
2782 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
2783 * The following computations will be performed by the kernel:
2784 *
2785 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
2786 * -# Add bias to final result if bias tensor is not a nullptr
2787 * -# Round to nearest division by a power-of-two using result_shift
2788 * -# Add offset to each result
2789 * -# Clamp the value between the specified min and max bounds
2790 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2791 *
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002792 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
Gian Marco58c57942017-11-28 09:10:03 +00002793 *
2794 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2795 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2796 * These values can be used to implement "rectified linear unit" activation functions
2797 *
2798 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
2799 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2800 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2801 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2802 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2803 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2804 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2805 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002806 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2807 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2808 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2809 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco58c57942017-11-28 09:10:03 +00002810 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2811 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2812 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2813 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2814 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2815 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2816 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2817 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2818 */
2819__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
2820#if defined(ADD_BIAS)
2821 VECTOR_DECLARATION(biases),
2822#endif // defined(ADD_BIAS)
2823 TENSOR3D_DECLARATION(dst))
2824{
2825 // Compute source and destination addresses
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002826 int x = get_global_id(0) * 4;
2827 int y = get_global_id(1);
2828 int z = get_global_id(2);
Georgios Pinitas932491f2018-09-21 16:33:15 +01002829
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002830 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Gian Marco58c57942017-11-28 09:10:03 +00002831
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002832 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2833
2834 int4 input_values = vload4(0, (__global int *)src_addr);
Gian Marco58c57942017-11-28 09:10:03 +00002835
2836#if defined(ADD_BIAS)
2837 // Add bias
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002838 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2839
2840 int4 biases_values = vload4(0, (__global int *)bias_addr);
2841 input_values += (int4)biases_values;
Gian Marco58c57942017-11-28 09:10:03 +00002842#endif // defined(ADD_BIAS)
2843
2844 // Multiply by result_mult_int and shift
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002845 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
Gian Marco58c57942017-11-28 09:10:03 +00002846
2847 // Add the offset terms to GEMM's result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002848 input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
Gian Marco58c57942017-11-28 09:10:03 +00002849
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002850 uchar4 res = convert_uchar4_sat(input_values);
Gian Marco58c57942017-11-28 09:10:03 +00002851
2852#if defined(MIN_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002853 res = max(res, (uchar4)MIN_BOUND);
Gian Marco58c57942017-11-28 09:10:03 +00002854#endif // defined(MIN_BOUND)
2855#if defined(MAX_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002856 res = min(res, (uchar4)MAX_BOUND);
Gian Marco58c57942017-11-28 09:10:03 +00002857#endif // defined(MAX_BOUND)
2858
2859 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002860 vstore4(res, 0, dst_addr);
Gian Marco58c57942017-11-28 09:10:03 +00002861}
Chunosov5124be52017-11-22 20:42:13 +07002862#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002863
2864#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
2865/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
2866 *
2867 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
2868 * The following computations will be performed by the kernel:
2869 *
2870 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
2871 * -# Add bias to final result if bias tensor is not a nullptr
2872 * -# Requantize
2873 * -# Add offset to each result
2874 * -# Clamp the value between the specified min and max bounds
2875 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2876 *
2877 * @attention The offset and scalar scale factor must be passed at compile time using -DRESULT_OFFSET, -DREAL_MULTIPLIER
2878 *
2879 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2880 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2881 * These values can be used to implement "rectified linear unit" activation functions
2882 *
2883 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
2884 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2885 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2886 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2887 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2888 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2889 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2890 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2891 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
2892 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
2893 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
2894 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
2895 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2896 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2897 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2898 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2899 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2900 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2901 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2902 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2903 * @param[in] dst_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
2904 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2905 */
2906__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
2907#if defined(ADD_BIAS)
2908 VECTOR_DECLARATION(biases),
2909#endif // defined(ADD_BIAS)
2910#if defined(DST_HEIGHT)
2911 TENSOR4D_DECLARATION(dst))
2912#else // defined(DST_HEIGHT)
2913 TENSOR3D_DECLARATION(dst))
2914#endif // defined(DST_HEIGHT)
2915{
2916 // Compute source and destination addresses
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002917 int x = get_global_id(0) * 4;
2918 int y = get_global_id(1);
2919 int z = get_global_id(2);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002920
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002921 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002922
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002923 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2924
2925 int4 input_values = vload4(0, (__global int *)src_addr);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002926
2927#if defined(ADD_BIAS)
2928 // Add bias
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002929 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2930
2931 int4 biases_values = vload4(0, (__global int *)bias_addr);
2932 input_values += (int4)biases_values;
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002933#endif // defined(ADD_BIAS)
2934
2935 // Convert to float
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002936 float16 input_values_f = convert_float4(input_values);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002937 input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
2938
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002939 uchar4 res = convert_uchar4_sat(input_values_f);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002940
2941#if defined(MIN_BOUND)
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002942 res = max(res, (uchar4)MIN_BOUND);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002943#endif // defined(MIN_BOUND)
2944#if defined(MAX_BOUND)
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002945 res = min(res, (uchar4)MAX_BOUND);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002946#endif // defined(MAX_BOUND)
2947
2948 // Store the result
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002949 vstore4(res, 0, dst_addr);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002950}
Gian Marco Iodicedb18a6f2019-05-30 09:53:10 +01002951#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)