blob: 8c1fa548e4a4754d37e6dd504c100ae1659e1184 [file] [log] [blame]
Gian Marco05288a22017-11-21 10:57:50 +00001/*
Gian Marco7b4d5472018-01-10 15:56:30 +00002 * Copyright (c) 2017-2018 ARM Limited.
Gian Marco05288a22017-11-21 10:57:50 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +000025#include "helpers_asymm.h"
Gian Marco05288a22017-11-21 10:57:50 +000026
Georgios Pinitasdaa38552018-08-28 17:43:18 +010027#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
28#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010029#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010030#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Gian Marco Iodice4b908652018-10-18 10:21:02 +010031#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
Georgios Pinitasdaa38552018-08-28 17:43:18 +010032#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
33#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arenac50da382018-07-26 15:50:09 +010034
Gian Marco19835e52018-01-30 13:35:54 +000035#if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +000036/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
Gian Marco19835e52018-01-30 13:35:54 +000037 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
Gian Marco05288a22017-11-21 10:57:50 +000038 *
Gian Marco19835e52018-01-30 13:35:54 +000039 * @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024
40 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
41 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
Gian Marco05288a22017-11-21 10:57:50 +000042 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +010043 * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
44 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
45 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
46 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
47 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
48 *
Gian Marco05288a22017-11-21 10:57:50 +000049 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
50 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
51 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
52 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
53 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
54 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
55 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
56 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
57 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
58 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
59 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
60 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
61 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
62 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
63 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
64 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
65 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
66 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +010067 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
68 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
69 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
70 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +000071 */
Gian Marco19835e52018-01-30 13:35:54 +000072__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
73 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +010074 IMAGE_DECLARATION(dst),
75 uint src0_stride_z,
76 uint src1_stride_z,
77 uint dst_stride_z
78#if defined(REINTERPRET_OUTPUT_AS_3D)
79 ,
80 uint cross_plane_pad
81#endif // REINTERPRET_OUTPUT_AS_3D
82 )
Gian Marco05288a22017-11-21 10:57:50 +000083{
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +010084 const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
85 const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
86 const int z = get_global_id(2);
Gian Marco05288a22017-11-21 10:57:50 +000087
Gian Marco19835e52018-01-30 13:35:54 +000088 // Offset
89 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
90 const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
91
92 // src_addr_a = address of matrix A
93 // src_addr_b = address of matrix B
Isabella Gottardib92805b2018-09-28 18:24:27 +010094 __global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
Gian Marco19835e52018-01-30 13:35:54 +000095 __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
Gian Marco05288a22017-11-21 10:57:50 +000096
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +010097#if defined(MATRIX_B_DEPTH)
98 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
99 src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;
100#else // defined(MATRIX_B_DEPTH)
101 src_addr_b += z * src1_stride_z;
102#endif // defined(MATRIX_B_DEPTH)
103
Gian Marco05288a22017-11-21 10:57:50 +0000104 // Compute end row address for matrix B
Gian Marco19835e52018-01-30 13:35:54 +0000105 __global uchar *src_end_addr_b = src_addr_b + COLS_B;
106
107 src_addr_a += offset_row_a;
108 src_addr_b += offset_row_b;
Gian Marco05288a22017-11-21 10:57:50 +0000109
110 // Reset accumulators
Gian Marco19835e52018-01-30 13:35:54 +0000111 int4 c00 = 0;
112 int4 c10 = 0;
113 int4 c20 = 0;
114 int4 c30 = 0;
Gian Marco05288a22017-11-21 10:57:50 +0000115
Gian Marco19835e52018-01-30 13:35:54 +0000116 for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +0000117 {
118 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco19835e52018-01-30 13:35:54 +0000119 int4 a0 = convert_int4(vload4(0, src_addr_a));
120 int4 b0 = convert_int4(vload4(0, src_addr_b));
Gian Marco05288a22017-11-21 10:57:50 +0000121
Gian Marco19835e52018-01-30 13:35:54 +0000122 c00 += (int4)a0.s0 * b0;
123 c10 += (int4)a0.s1 * b0;
124 c20 += (int4)a0.s2 * b0;
125 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000126
Gian Marco19835e52018-01-30 13:35:54 +0000127 a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
128 b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));
Gian Marco05288a22017-11-21 10:57:50 +0000129
Gian Marco19835e52018-01-30 13:35:54 +0000130 c00 += (int4)a0.s0 * b0;
131 c10 += (int4)a0.s1 * b0;
132 c20 += (int4)a0.s2 * b0;
133 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000134 }
135
Gian Marco19835e52018-01-30 13:35:54 +0000136 for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
Gian Marco05288a22017-11-21 10:57:50 +0000137 {
138 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco19835e52018-01-30 13:35:54 +0000139 int4 a0 = convert_int4(vload4(0, src_addr_a));
140 int4 b0 = convert_int4(vload4(0, src_addr_b));
Gian Marco05288a22017-11-21 10:57:50 +0000141
Gian Marco19835e52018-01-30 13:35:54 +0000142 c00 += (int4)a0.s0 * b0;
143 c10 += (int4)a0.s1 * b0;
144 c20 += (int4)a0.s2 * b0;
145 c30 += (int4)a0.s3 * b0;
Gian Marco05288a22017-11-21 10:57:50 +0000146 }
147
148 // Compute destination address
149 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
150
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100151#if defined(REINTERPRET_OUTPUT_AS_3D)
152 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
153 // in order to take into account the presence of possible cross plane paddings
154 //
155 // | |
156 // | plane0 |
157 // | |
158 // |__________________|
159 // |******************|
160 // | cross_plane_pad |
161 // |******************|
162 // | |
163 // | plane1 |
164 // | |
165 // |__________________|
166
167 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
168 uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
169 zout = min(DEPTH_GEMM3D - 1, zout);
170
171 // Add offset due to the cross plane paddings
172 zout *= (cross_plane_pad * dst_stride_y);
173
174 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
175 // multiply dst_stride_z by DEPTH_GEMM3D
176 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
177
Gian Marco19835e52018-01-30 13:35:54 +0000178 // Store 4x4 block
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100179 vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
180 vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
181 vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
182 vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
183
184#else // defined(REINTERPRET_OUTPUT_AS_3D)
185 // Add offset for batched GEMM
186 dst.ptr += z * dst_stride_z;
187
188 // Store 4x4 block
189 vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
190 vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
191 vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
192 vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
193#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +0000194}
Gian Marco19835e52018-01-30 13:35:54 +0000195
196/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
197 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
198 *
199 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
200 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
201 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
202 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100203 * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
204 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
205 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
206 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
207 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
208 *
Gian Marco19835e52018-01-30 13:35:54 +0000209 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
210 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
211 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
212 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
213 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
214 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
215 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
216 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
217 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
218 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
219 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
220 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
221 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
222 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
223 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
224 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
225 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
226 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100227 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
228 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
229 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
230 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco19835e52018-01-30 13:35:54 +0000231 */
232__kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0),
233 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100234 IMAGE_DECLARATION(dst),
235 uint src0_stride_z,
236 uint src1_stride_z,
237 uint dst_stride_z
238#if defined(REINTERPRET_OUTPUT_AS_3D)
239 ,
240 uint cross_plane_pad
241#endif // REINTERPRET_OUTPUT_AS_3D
242 )
Gian Marco19835e52018-01-30 13:35:54 +0000243{
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100244 const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
245 const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
246 const int z = get_global_id(2);
Gian Marco19835e52018-01-30 13:35:54 +0000247
248 // Offset
249 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
250 const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
251
252 // src_addr_a = address of matrix A
253 // src_addr_b = address of matrix B
Isabella Gottardib92805b2018-09-28 18:24:27 +0100254 __global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
Gian Marco19835e52018-01-30 13:35:54 +0000255 __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
256
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100257#if defined(MATRIX_B_DEPTH)
258 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
259 src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;
260#else // defined(MATRIX_B_DEPTH)
261 src_addr_b += z * src1_stride_z;
262#endif // defined(MATRIX_B_DEPTH)
263
Gian Marco19835e52018-01-30 13:35:54 +0000264 // Compute end row address for matrix B
265 __global uchar *src_end_addr_b = src_addr_b + COLS_B;
266
267 src_addr_a += offset_row_a;
268 src_addr_b += offset_row_b;
269
270 // Reset accumulators
271 uint c00 = 0;
272 uint c01 = 0;
273 uint c02 = 0;
274 uint c03 = 0;
275 uint c10 = 0;
276 uint c11 = 0;
277 uint c12 = 0;
278 uint c13 = 0;
279 uint c20 = 0;
280 uint c21 = 0;
281 uint c22 = 0;
282 uint c23 = 0;
283 uint c30 = 0;
284 uint c31 = 0;
285 uint c32 = 0;
286 uint c33 = 0;
287
288#if MULT_INTERLEAVE4X4_HEIGHT == 1
289 for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
290 {
291 // Load values from matrix A (interleaved) and matrix B (transposed)
292 uchar16 a0 = vload16(0, src_addr_a);
293 uchar4 b0 = vload4(0, src_addr_b);
294
295 c00 += (ushort)a0.s0 * b0.s0;
296 c01 += (ushort)a0.s0 * b0.s1;
297 c02 += (ushort)a0.s0 * b0.s2;
298 c03 += (ushort)a0.s0 * b0.s3;
299
300 c10 += (ushort)a0.s1 * b0.s0;
301 c11 += (ushort)a0.s1 * b0.s1;
302 c12 += (ushort)a0.s1 * b0.s2;
303 c13 += (ushort)a0.s1 * b0.s3;
304
305 c20 += (ushort)a0.s2 * b0.s0;
306 c21 += (ushort)a0.s2 * b0.s1;
307 c22 += (ushort)a0.s2 * b0.s2;
308 c23 += (ushort)a0.s2 * b0.s3;
309
310 c30 += (ushort)a0.s3 * b0.s0;
311 c31 += (ushort)a0.s3 * b0.s1;
312 c32 += (ushort)a0.s3 * b0.s2;
313 c33 += (ushort)a0.s3 * b0.s3;
314
315 // Load values from matrix B (transposed)
316 b0 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);
317
318 c00 += (ushort)a0.s4 * b0.s0;
319 c01 += (ushort)a0.s4 * b0.s1;
320 c02 += (ushort)a0.s4 * b0.s2;
321 c03 += (ushort)a0.s4 * b0.s3;
322
323 c10 += (ushort)a0.s5 * b0.s0;
324 c11 += (ushort)a0.s5 * b0.s1;
325 c12 += (ushort)a0.s5 * b0.s2;
326 c13 += (ushort)a0.s5 * b0.s3;
327
328 c20 += (ushort)a0.s6 * b0.s0;
329 c21 += (ushort)a0.s6 * b0.s1;
330 c22 += (ushort)a0.s6 * b0.s2;
331 c23 += (ushort)a0.s6 * b0.s3;
332
333 c30 += (ushort)a0.s7 * b0.s0;
334 c31 += (ushort)a0.s7 * b0.s1;
335 c32 += (ushort)a0.s7 * b0.s2;
336 c33 += (ushort)a0.s7 * b0.s3;
337
338 // Load values from matrix B (transposed)
339 b0 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);
340
341 c00 += (ushort)a0.s8 * b0.s0;
342 c01 += (ushort)a0.s8 * b0.s1;
343 c02 += (ushort)a0.s8 * b0.s2;
344 c03 += (ushort)a0.s8 * b0.s3;
345
346 c10 += (ushort)a0.s9 * b0.s0;
347 c11 += (ushort)a0.s9 * b0.s1;
348 c12 += (ushort)a0.s9 * b0.s2;
349 c13 += (ushort)a0.s9 * b0.s3;
350
351 c20 += (ushort)a0.sA * b0.s0;
352 c21 += (ushort)a0.sA * b0.s1;
353 c22 += (ushort)a0.sA * b0.s2;
354 c23 += (ushort)a0.sA * b0.s3;
355
356 c30 += (ushort)a0.sB * b0.s0;
357 c31 += (ushort)a0.sB * b0.s1;
358 c32 += (ushort)a0.sB * b0.s2;
359 c33 += (ushort)a0.sB * b0.s3;
360
361 // Load values from matrix B (transposed)
362 b0 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);
363
364 c00 += (ushort)a0.sC * b0.s0;
365 c01 += (ushort)a0.sC * b0.s1;
366 c02 += (ushort)a0.sC * b0.s2;
367 c03 += (ushort)a0.sC * b0.s3;
368
369 c10 += (ushort)a0.sD * b0.s0;
370 c11 += (ushort)a0.sD * b0.s1;
371 c12 += (ushort)a0.sD * b0.s2;
372 c13 += (ushort)a0.sD * b0.s3;
373
374 c20 += (ushort)a0.sE * b0.s0;
375 c21 += (ushort)a0.sE * b0.s1;
376 c22 += (ushort)a0.sE * b0.s2;
377 c23 += (ushort)a0.sE * b0.s3;
378
379 c30 += (ushort)a0.sF * b0.s0;
380 c31 += (ushort)a0.sF * b0.s1;
381 c32 += (ushort)a0.sF * b0.s2;
382 c33 += (ushort)a0.sF * b0.s3;
383
384 // Load values from matrix A (interleaved) and matrix B (transposed)
385 a0 = vload16(0, src_addr_a + 16);
386 b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
387
388 c00 += (ushort)a0.s0 * b0.s0;
389 c01 += (ushort)a0.s0 * b0.s1;
390 c02 += (ushort)a0.s0 * b0.s2;
391 c03 += (ushort)a0.s0 * b0.s3;
392
393 c10 += (ushort)a0.s1 * b0.s0;
394 c11 += (ushort)a0.s1 * b0.s1;
395 c12 += (ushort)a0.s1 * b0.s2;
396 c13 += (ushort)a0.s1 * b0.s3;
397
398 c20 += (ushort)a0.s2 * b0.s0;
399 c21 += (ushort)a0.s2 * b0.s1;
400 c22 += (ushort)a0.s2 * b0.s2;
401 c23 += (ushort)a0.s2 * b0.s3;
402
403 c30 += (ushort)a0.s3 * b0.s0;
404 c31 += (ushort)a0.s3 * b0.s1;
405 c32 += (ushort)a0.s3 * b0.s2;
406 c33 += (ushort)a0.s3 * b0.s3;
407
408 // Load values from matrix B (transposed)
409 b0 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
410
411 c00 += (ushort)a0.s4 * b0.s0;
412 c01 += (ushort)a0.s4 * b0.s1;
413 c02 += (ushort)a0.s4 * b0.s2;
414 c03 += (ushort)a0.s4 * b0.s3;
415
416 c10 += (ushort)a0.s5 * b0.s0;
417 c11 += (ushort)a0.s5 * b0.s1;
418 c12 += (ushort)a0.s5 * b0.s2;
419 c13 += (ushort)a0.s5 * b0.s3;
420
421 c20 += (ushort)a0.s6 * b0.s0;
422 c21 += (ushort)a0.s6 * b0.s1;
423 c22 += (ushort)a0.s6 * b0.s2;
424 c23 += (ushort)a0.s6 * b0.s3;
425
426 c30 += (ushort)a0.s7 * b0.s0;
427 c31 += (ushort)a0.s7 * b0.s1;
428 c32 += (ushort)a0.s7 * b0.s2;
429 c33 += (ushort)a0.s7 * b0.s3;
430
431 // Load values from matrix B (transposed)
432 b0 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
433
434 c00 += (ushort)a0.s8 * b0.s0;
435 c01 += (ushort)a0.s8 * b0.s1;
436 c02 += (ushort)a0.s8 * b0.s2;
437 c03 += (ushort)a0.s8 * b0.s3;
438
439 c10 += (ushort)a0.s9 * b0.s0;
440 c11 += (ushort)a0.s9 * b0.s1;
441 c12 += (ushort)a0.s9 * b0.s2;
442 c13 += (ushort)a0.s9 * b0.s3;
443
444 c20 += (ushort)a0.sA * b0.s0;
445 c21 += (ushort)a0.sA * b0.s1;
446 c22 += (ushort)a0.sA * b0.s2;
447 c23 += (ushort)a0.sA * b0.s3;
448
449 c30 += (ushort)a0.sB * b0.s0;
450 c31 += (ushort)a0.sB * b0.s1;
451 c32 += (ushort)a0.sB * b0.s2;
452 c33 += (ushort)a0.sB * b0.s3;
453
454 // Load values from matrix B (transposed)
455 b0 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);
456
457 c00 += (ushort)a0.sC * b0.s0;
458 c01 += (ushort)a0.sC * b0.s1;
459 c02 += (ushort)a0.sC * b0.s2;
460 c03 += (ushort)a0.sC * b0.s3;
461
462 c10 += (ushort)a0.sD * b0.s0;
463 c11 += (ushort)a0.sD * b0.s1;
464 c12 += (ushort)a0.sD * b0.s2;
465 c13 += (ushort)a0.sD * b0.s3;
466
467 c20 += (ushort)a0.sE * b0.s0;
468 c21 += (ushort)a0.sE * b0.s1;
469 c22 += (ushort)a0.sE * b0.s2;
470 c23 += (ushort)a0.sE * b0.s3;
471
472 c30 += (ushort)a0.sF * b0.s0;
473 c31 += (ushort)a0.sF * b0.s1;
474 c32 += (ushort)a0.sF * b0.s2;
475 c33 += (ushort)a0.sF * b0.s3;
476 }
477#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
478
479 for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
480 {
481 // Load values from matrix A (interleaved) and matrix B (transposed)
482 uchar4 a0 = vload4(0, src_addr_a);
483 uchar4 b0 = vload4(0, src_addr_b);
484
485 c00 += (ushort)a0.s0 * b0.s0;
486 c01 += (ushort)a0.s0 * b0.s1;
487 c02 += (ushort)a0.s0 * b0.s2;
488 c03 += (ushort)a0.s0 * b0.s3;
489
490 c10 += (ushort)a0.s1 * b0.s0;
491 c11 += (ushort)a0.s1 * b0.s1;
492 c12 += (ushort)a0.s1 * b0.s2;
493 c13 += (ushort)a0.s1 * b0.s3;
494
495 c20 += (ushort)a0.s2 * b0.s0;
496 c21 += (ushort)a0.s2 * b0.s1;
497 c22 += (ushort)a0.s2 * b0.s2;
498 c23 += (ushort)a0.s2 * b0.s3;
499
500 c30 += (ushort)a0.s3 * b0.s0;
501 c31 += (ushort)a0.s3 * b0.s1;
502 c32 += (ushort)a0.s3 * b0.s2;
503 c33 += (ushort)a0.s3 * b0.s3;
504 }
505
506 // Compute destination address
507 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
508
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100509#if defined(REINTERPRET_OUTPUT_AS_3D)
510 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
511 // in order to take into account the presence of possible cross plane paddings
512 //
513 // | |
514 // | plane0 |
515 // | |
516 // |__________________|
517 // |******************|
518 // | cross_plane_pad |
519 // |******************|
520 // | |
521 // | plane1 |
522 // | |
523 // |__________________|
524
525 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
526 uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
527 zout = min(DEPTH_GEMM3D - 1, zout);
528
529 // Add offset due to the cross plane paddings
530 zout *= (cross_plane_pad * dst_stride_y);
531
532 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
533 // multiply dst_stride_z by DEPTH_GEMM3D
534 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
535
Gian Marco19835e52018-01-30 13:35:54 +0000536 // Store 4x4 block
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100537 vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
538 vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
539 vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
540 vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
541
542#else // defined(REINTERPRET_OUTPUT_AS_3D)
543 // Add offset for batched GEMM
544 dst.ptr += z * dst_stride_z;
545
546 // Store 4x4 block
547 vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
548 vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
549 vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
550 vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
551#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco19835e52018-01-30 13:35:54 +0000552}
Giorgio Arena6200fa42018-07-06 17:06:36 +0100553
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100554#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arena6200fa42018-07-06 17:06:36 +0100555/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
556 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
557 *
558 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
559 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
560 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
561 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100562 * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
563 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
564 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
565 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
566 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
567 *
Giorgio Arena6200fa42018-07-06 17:06:36 +0100568 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
569 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
570 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
571 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
572 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
573 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
574 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
575 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
576 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
577 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
578 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
579 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
580 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
581 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
582 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
583 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
584 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
585 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100586 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
587 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
588 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
589 * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +0100590 */
591__kernel void gemmlowp_mm_interleaved_transposed_bifrost_dot8(IMAGE_DECLARATION(src0),
592 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100593 IMAGE_DECLARATION(dst),
594 uint src0_stride_z,
595 uint src1_stride_z,
596 uint dst_stride_z
597#if defined(REINTERPRET_OUTPUT_AS_3D)
598 ,
599 uint cross_plane_pad
600#endif // REINTERPRET_OUTPUT_AS_3D
601 )
Giorgio Arena6200fa42018-07-06 17:06:36 +0100602{
Giorgio Arena6200fa42018-07-06 17:06:36 +0100603 // Offset
604 const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
605 const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
606
607 // src_addr_a = address of matrix A
608 // src_addr_b = address of matrix B
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100609 __global uchar *src_addr_a = (__global uchar *)(src0_ptr + (get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT) * src0_stride_y + get_global_id(2) * src0_stride_z + src0_offset_first_element_in_bytes);
610 __global uchar *src_addr_b = (__global uchar *)(src1_ptr + (get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP) * src1_stride_y + src1_offset_first_element_in_bytes);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100611
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100612#if defined(MATRIX_B_DEPTH)
613 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100614 src_addr_b += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100615#else // defined(MATRIX_B_DEPTH)
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100616 src_addr_b += get_global_id(2) * src1_stride_z;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100617#endif // defined(MATRIX_B_DEPTH)
618
Giorgio Arena6200fa42018-07-06 17:06:36 +0100619 src_addr_a += offset_row_a;
620 src_addr_b += offset_row_b;
621
622 // Reset accumulators
623 uint c00 = 0;
624 uint c01 = 0;
625 uint c02 = 0;
626 uint c03 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100627
Giorgio Arena6200fa42018-07-06 17:06:36 +0100628 uint c10 = 0;
629 uint c11 = 0;
630 uint c12 = 0;
631 uint c13 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100632
Giorgio Arena6200fa42018-07-06 17:06:36 +0100633 uint c20 = 0;
634 uint c21 = 0;
635 uint c22 = 0;
636 uint c23 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100637
Giorgio Arena6200fa42018-07-06 17:06:36 +0100638 uint c30 = 0;
639 uint c31 = 0;
640 uint c32 = 0;
641 uint c33 = 0;
642
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100643#define COLS_MTX_B (COLS_B / (16 * MULT_TRANSPOSE1XW_WIDTH))
644
Giorgio Arena6200fa42018-07-06 17:06:36 +0100645#if MULT_INTERLEAVE4X4_HEIGHT == 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100646 int i = 0;
647 for(; i <= (int)(COLS_MTX_B - 8); i += 8)
Giorgio Arena6200fa42018-07-06 17:06:36 +0100648 {
649 // Load values from matrix A (interleaved) and matrix B (transposed)
650 uchar16 a0 = vload16(0, src_addr_a);
651 uchar4 b0 = vload4(0, src_addr_b);
652 uchar4 b1 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);
653 uchar4 b2 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);
654 uchar4 b3 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100655 uchar4 b4 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
656 uchar4 b5 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
657 uchar4 b6 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
658 uchar4 b7 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100659
660 // Accumulate
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100661 ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c00);
662 ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c01);
663 ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c02);
664 ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c03);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100665
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100666 ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c10);
667 ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c11);
668 ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c12);
669 ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c13);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100670
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100671 ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c20);
672 ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c21);
673 ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c22);
674 ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c23);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100675
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100676 ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c30);
677 ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c31);
678 ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c32);
679 ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c33);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100680
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100681 // Accumulate
Giorgio Arena6200fa42018-07-06 17:06:36 +0100682 a0 = vload16(0, src_addr_a + 16);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100683
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100684 ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c00);
685 ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c01);
686 ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c02);
687 ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c03);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100688
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100689 ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c10);
690 ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c11);
691 ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c12);
692 ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c13);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100693
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100694 ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c20);
695 ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c21);
696 ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c22);
697 ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c23);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100698
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100699 ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c30);
700 ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c31);
701 ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c32);
702 ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c33);
703
704 src_addr_a += 32;
705 src_addr_b += 32 * TRANSPOSE1XW_WIDTH_STEP;
Giorgio Arena6200fa42018-07-06 17:06:36 +0100706 }
707#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100708 int i_left_over = 0;
709 for(; i < (int)(COLS_MTX_B); ++i)
Giorgio Arena6200fa42018-07-06 17:06:36 +0100710 {
711 // Load values from matrix A (interleaved) and matrix B (transposed)
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100712 uchar16 a0 = vload16(0, src_addr_a + (i_left_over % 4) + ((i_left_over / 4) * 16));
Gian Marco Iodice0c54a622018-10-30 12:20:03 +0000713 uchar4 b0 = vload4(0, src_addr_b);
Giorgio Arena6200fa42018-07-06 17:06:36 +0100714
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100715 c00 += a0.s0 * b0.s0;
716 c01 += a0.s0 * b0.s1;
717 c02 += a0.s0 * b0.s2;
718 c03 += a0.s0 * b0.s3;
Giorgio Arena6200fa42018-07-06 17:06:36 +0100719
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100720 c10 += a0.s4 * b0.s0;
721 c11 += a0.s4 * b0.s1;
722 c12 += a0.s4 * b0.s2;
723 c13 += a0.s4 * b0.s3;
Giorgio Arena6200fa42018-07-06 17:06:36 +0100724
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100725 c20 += a0.s8 * b0.s0;
726 c21 += a0.s8 * b0.s1;
727 c22 += a0.s8 * b0.s2;
728 c23 += a0.s8 * b0.s3;
Giorgio Arena6200fa42018-07-06 17:06:36 +0100729
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100730 c30 += a0.sC * b0.s0;
731 c31 += a0.sC * b0.s1;
732 c32 += a0.sC * b0.s2;
733 c33 += a0.sC * b0.s3;
734
735 i_left_over++;
736 src_addr_b += 4 * TRANSPOSE1XW_WIDTH_STEP;
Giorgio Arena6200fa42018-07-06 17:06:36 +0100737 }
738
739 // Compute destination address
740 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
741
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100742#if defined(REINTERPRET_OUTPUT_AS_3D)
743 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
744 // in order to take into account the presence of possible cross plane paddings
745 //
746 // | |
747 // | plane0 |
748 // | |
749 // |__________________|
750 // |******************|
751 // | cross_plane_pad |
752 // |******************|
753 // | |
754 // | plane1 |
755 // | |
756 // |__________________|
757
758 // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
759 uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
760 zout = min(DEPTH_GEMM3D - 1, zout);
761
762 // Add offset due to the cross plane paddings
763 zout *= (cross_plane_pad * dst_stride_y);
764
765 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
766 // multiply dst_stride_z by DEPTH_GEMM3D
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100767 dst.ptr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100768
Giorgio Arena6200fa42018-07-06 17:06:36 +0100769 // Store 4x4 block
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100770 vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
771 vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
772 vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
773 vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
774
775#else // defined(REINTERPRET_OUTPUT_AS_3D)
776 // Add offset for batched GEMM
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100777 dst.ptr += get_global_id(2) * dst_stride_z;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100778
779 // Store 4x4 block
780 vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
781 vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
782 vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
783 vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
784#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +0100785}
Georgios Pinitasdaa38552018-08-28 17:43:18 +0100786#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arena6200fa42018-07-06 17:06:36 +0100787
Gian Marco19835e52018-01-30 13:35:54 +0000788#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
Gian Marco05288a22017-11-21 10:57:50 +0000789
790#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
791#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
792#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
793#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
794/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
795 *
796 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
797 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100798 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
799 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
800 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
801 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
802 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
803 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
804 *
Gian Marco05288a22017-11-21 10:57:50 +0000805 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
806 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
807 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
808 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
809 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
810 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
811 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
812 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
813 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
814 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
815 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
816 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
817 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
818 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
819 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
820 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
821 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
822 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100823 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
824 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
825 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
826 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
827 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +0000828 */
Gian Marco7b4d5472018-01-10 15:56:30 +0000829__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
830 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100831 IMAGE_DECLARATION(dst),
832 uint src0_stride_z,
833 uint src1_stride_z,
834 uint dst_stride_z
835#if defined(REINTERPRET_INPUT_AS_3D)
836 ,
837 uint src_cross_plane_pad
838#endif // REINTERPRET_INPUT_AS_3D
839#if defined(REINTERPRET_OUTPUT_AS_3D)
840 ,
841 uint dst_cross_plane_pad
842#endif // REINTERPRET_OUTPUT_AS_3D
843 )
Gian Marco05288a22017-11-21 10:57:50 +0000844{
845 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
846
847 // Compute starting address for matrix A and Matrix B
848 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
849
850 // Update address for the matrix A
851 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
852
853 // Update address for the matrix B
854 src_addr.s1 += idx;
855
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100856#if defined(REINTERPRET_INPUT_AS_3D)
857 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
858 // in order to take into account the presence of possible cross plane paddings
859 //
860 // | |
861 // | plane0 |
862 // | |
863 // |__________________|
864 // |******************|
865 // | cross_plane_pad |
866 // |******************|
867 // | |
868 // | plane1 |
869 // | |
870 // |__________________|
871
872 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
873 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
874 zin = min(DEPTH_GEMM3D - 1, zin);
875
876 // Add offset due to the cross plane paddings
877 zin *= (src_cross_plane_pad * src0_stride_y);
878
879 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
880 // multiply src0_stride_z by DEPTH_GEMM3D
881 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
882
883#else // defined(REINTERPRET_INPUT_AS_3D)
884
885 // Add offset for batched GEMM
886 src_addr.s0 += get_global_id(2) * src0_stride_z;
887
888#endif // defined(REINTERPRET_INPUT_AS_3D)
889
890#if defined(MATRIX_B_DEPTH)
891 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
892 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
893#else // defined(MATRIX_B_DEPTH)
894 src_addr.s1 += get_global_id(2) * src1_stride_z;
895#endif // defined(MATRIX_B_DEPTH)
896
Gian Marco05288a22017-11-21 10:57:50 +0000897 int end_row_vec_a = src_addr.s0 + COLS_A;
898
899 VECTOR_UINT acc0 = 0;
900#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
901 VECTOR_UINT acc1 = 0;
902#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
903#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
904 VECTOR_UINT acc2 = 0;
905#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
906#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
907 VECTOR_UINT acc3 = 0;
908#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000909#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
910 VECTOR_UINT acc4 = 0;
911#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000912
913 for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
914 {
915 // Load values from matrix A
916 uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
917#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
918 uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
919#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
920#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
921 uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
922#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
923#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
924 uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
925#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000926#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
927 uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
928#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000929 // Load values from matrix B
930 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
931 VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);
932
933 // Accumulate
934 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
935 acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
936#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
937 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
938 acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
939#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
940#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
941 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
942 acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
943#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
944#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
945 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
946 acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
947#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000948#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
949 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
950 acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
951#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000952 }
953
954 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
955 {
956 // Load values from matrix A
957 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
958#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
959 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
960#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
961#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
962 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
963#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
964#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
965 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
966#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000967#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
968 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
969#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000970 // Load values from matrix B
971 VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
972
973 // Accumulate
974 acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
975#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
976 acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
977#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
978#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
979 acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
980#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
981#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
982 acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
983#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +0000984#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
985 acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
986#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Gian Marco05288a22017-11-21 10:57:50 +0000987 }
988
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100989 const int z = get_global_id(2);
990
Gian Marco05288a22017-11-21 10:57:50 +0000991 // Compute destination address
992 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
993
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +0100994#if defined(REINTERPRET_OUTPUT_AS_3D)
995 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
996 // in order to take into account the presence of possible cross plane paddings
997 //
998 // | |
999 // | plane0 |
1000 // | |
1001 // |__________________|
1002 // |******************|
1003 // | cross_plane_pad |
1004 // |******************|
1005 // | |
1006 // | plane1 |
1007 // | |
1008 // |__________________|
1009
1010 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
1011 uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
1012 zout = min(DEPTH_GEMM3D - 1, zout);
1013
1014 // Add offset due to the cross plane paddings
1015 zout *= (dst_cross_plane_pad * dst_stride_y);
1016
1017 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1018 // multiply dst_stride_z by DEPTH_GEMM3D
1019 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
1020
Gian Marco05288a22017-11-21 10:57:50 +00001021 // Store the result
1022 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001023 (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
Gian Marco05288a22017-11-21 10:57:50 +00001024#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1025 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001026 (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
Gian Marco05288a22017-11-21 10:57:50 +00001027#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1028#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1029 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001030 (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
Gian Marco05288a22017-11-21 10:57:50 +00001031#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1032#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1033 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001034 (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
Gian Marco05288a22017-11-21 10:57:50 +00001035#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco7b4d5472018-01-10 15:56:30 +00001036#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1037 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001038 (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
Gian Marco7b4d5472018-01-10 15:56:30 +00001039#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001040
1041#else // defined(REINTERPRET_OUTPUT_AS_3D)
1042 // Add offset for batched GEMM
1043 dst.ptr += z * dst_stride_z;
1044
1045 // Store the result
1046 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
1047 (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
1048#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1049 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
1050 (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
1051#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1052#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1053 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
1054 (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
1055#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1056#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1057 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
1058 (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
1059#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1060#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1061 VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
1062 (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
1063#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1064#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco7b4d5472018-01-10 15:56:30 +00001065}
1066
1067/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
1068 *
1069 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
1070 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001071 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1072 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1073 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1074 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1075 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1076 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
1077 *
Gian Marco7b4d5472018-01-10 15:56:30 +00001078 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
1079 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1080 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1081 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1082 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1083 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1084 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
1085 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1086 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1087 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1088 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1089 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1090 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
1091 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1092 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1093 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1094 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1095 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001096 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
1097 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
1098 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1099 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
1100 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
Gian Marco7b4d5472018-01-10 15:56:30 +00001101 */
1102__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
1103 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001104 IMAGE_DECLARATION(dst),
1105 uint src0_stride_z,
1106 uint src1_stride_z,
1107 uint dst_stride_z
1108#if defined(REINTERPRET_INPUT_AS_3D)
1109 ,
1110 uint src_cross_plane_pad
1111#endif // REINTERPRET_INPUT_AS_3D
1112#if defined(REINTERPRET_OUTPUT_AS_3D)
1113 ,
1114 uint dst_cross_plane_pad
1115#endif // REINTERPRET_OUTPUT_AS_3D
1116 )
Gian Marco7b4d5472018-01-10 15:56:30 +00001117{
1118 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
1119
1120 // Compute starting address for matrix A and Matrix B
1121 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1122
1123 // Update address for the matrix A
1124 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
1125
1126 // Update address for the matrix B
1127 src_addr.s1 += idx;
1128
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001129#if defined(REINTERPRET_INPUT_AS_3D)
1130 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
1131 // in order to take into account the presence of possible cross plane paddings
1132 //
1133 // | |
1134 // | plane0 |
1135 // | |
1136 // |__________________|
1137 // |******************|
1138 // | cross_plane_pad |
1139 // |******************|
1140 // | |
1141 // | plane1 |
1142 // | |
1143 // |__________________|
1144
1145 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
1146 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
1147 zin = min(DEPTH_GEMM3D - 1, zin);
1148
1149 // Add offset due to the cross plane paddings
1150 zin *= (src_cross_plane_pad * src0_stride_y);
1151
1152 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1153 // multiply src0_stride_z by DEPTH_GEMM3D
1154 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
1155
1156#else // defined(REINTERPRET_INPUT_AS_3D)
1157
1158 // Add offset for batched GEMM
1159 src_addr.s0 += get_global_id(2) * src0_stride_z;
1160
1161#endif // defined(REINTERPRET_INPUT_AS_3D)
1162
1163#if defined(MATRIX_B_DEPTH)
1164 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1165 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
1166#else // defined(MATRIX_B_DEPTH)
1167 src_addr.s1 += get_global_id(2) * src1_stride_z;
1168#endif // defined(MATRIX_B_DEPTH)
1169
Gian Marco7b4d5472018-01-10 15:56:30 +00001170 int end_row_vec_a = src_addr.s0 + COLS_A;
1171
1172 uint acc00 = 0;
1173 uint acc01 = 0;
1174 uint acc02 = 0;
1175 uint acc03 = 0;
1176#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1177 uint acc10 = 0;
1178 uint acc11 = 0;
1179 uint acc12 = 0;
1180 uint acc13 = 0;
1181#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1182#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1183 uint acc20 = 0;
1184 uint acc21 = 0;
1185 uint acc22 = 0;
1186 uint acc23 = 0;
1187#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1188#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1189 uint acc30 = 0;
1190 uint acc31 = 0;
1191 uint acc32 = 0;
1192 uint acc33 = 0;
1193#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1194#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1195 uint acc40 = 0;
1196 uint acc41 = 0;
1197 uint acc42 = 0;
1198 uint acc43 = 0;
1199#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1200
1201 for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
1202 {
1203 // Load values from matrix A
1204 uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
1205#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1206 uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
1207#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1208#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1209 uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
1210#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1211#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1212 uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
1213#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1214#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1215 uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
1216#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1217 // Load values from matrix B
1218 uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
1219 uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
1220 uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
1221 uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
1222
1223 {
1224 // Accumulate
1225 ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
1226 ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
1227 ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
1228 ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;
1229
1230 ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
1231 ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
1232 ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
1233 ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;
1234
1235 ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
1236 ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
1237 ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
1238 ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;
1239
1240 ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
1241 ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
1242 ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
1243 ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;
1244
1245 acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
1246 acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
1247 acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
1248 acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
1249 }
1250#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1251 {
1252 // Accumulate
1253 ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
1254 ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
1255 ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
1256 ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;
1257
1258 ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
1259 ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
1260 ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
1261 ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;
1262
1263 ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
1264 ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
1265 ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
1266 ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;
1267
1268 ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
1269 ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
1270 ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
1271 ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;
1272
1273 acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
1274 acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
1275 acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
1276 acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
1277 }
1278#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1279#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1280 {
1281 // Accumulate
1282 ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
1283 ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
1284 ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
1285 ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;
1286
1287 ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
1288 ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
1289 ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
1290 ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;
1291
1292 ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
1293 ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
1294 ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
1295 ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;
1296
1297 ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
1298 ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
1299 ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
1300 ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;
1301
1302 acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
1303 acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
1304 acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
1305 acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
1306 }
1307#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1308#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1309 {
1310 // Accumulate
1311 ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
1312 ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
1313 ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
1314 ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;
1315
1316 ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
1317 ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
1318 ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
1319 ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;
1320
1321 ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
1322 ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
1323 ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
1324 ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;
1325
1326 ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
1327 ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
1328 ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
1329 ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;
1330
1331 acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
1332 acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
1333 acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
1334 acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
1335 }
1336#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1337#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1338 {
1339 // Accumulate
1340 ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
1341 ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
1342 ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
1343 ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;
1344
1345 ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
1346 ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
1347 ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
1348 ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;
1349
1350 ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
1351 ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
1352 ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
1353 ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;
1354
1355 ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
1356 ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
1357 ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
1358 ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;
1359
1360 acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
1361 acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
1362 acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
1363 acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
1364 }
1365#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1366 }
1367
1368 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
1369 {
1370 // Load values from matrix A
1371 uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
1372#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1373 uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
1374#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1375#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1376 uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
1377#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1378#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1379 uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
1380#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1381#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1382 uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
1383#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1384 // Load values from matrix B
1385 uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);
1386
1387 // Accumulate
1388 {
1389 // Accumulate
1390 ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
1391 ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
1392 ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
1393 ushort tmp3 = (ushort)b0.s3 * (ushort)a0;
1394
1395 acc00 += ((uint)tmp0);
1396 acc01 += ((uint)tmp1);
1397 acc02 += ((uint)tmp2);
1398 acc03 += ((uint)tmp3);
1399 }
1400#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1401 {
1402 // Accumulate
1403 ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
1404 ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
1405 ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
1406 ushort tmp3 = (ushort)b0.s3 * (ushort)a1;
1407
1408 acc10 += ((uint)tmp0);
1409 acc11 += ((uint)tmp1);
1410 acc12 += ((uint)tmp2);
1411 acc13 += ((uint)tmp3);
1412 }
1413#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1414#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1415 {
1416 // Accumulate
1417 ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
1418 ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
1419 ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
1420 ushort tmp3 = (ushort)b0.s3 * (ushort)a2;
1421
1422 acc20 += ((uint)tmp0);
1423 acc21 += ((uint)tmp1);
1424 acc22 += ((uint)tmp2);
1425 acc23 += ((uint)tmp3);
1426 }
1427#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1428#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1429 {
1430 // Accumulate
1431 ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
1432 ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
1433 ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
1434 ushort tmp3 = (ushort)b0.s3 * (ushort)a3;
1435
1436 acc30 += ((uint)tmp0);
1437 acc31 += ((uint)tmp1);
1438 acc32 += ((uint)tmp2);
1439 acc33 += ((uint)tmp3);
1440 }
1441#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1442#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1443 {
1444 // Accumulate
1445 ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
1446 ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
1447 ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
1448 ushort tmp3 = (ushort)b0.s3 * (ushort)a4;
1449
1450 acc40 += ((uint)tmp0);
1451 acc41 += ((uint)tmp1);
1452 acc42 += ((uint)tmp2);
1453 acc43 += ((uint)tmp3);
1454 }
1455#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1456 }
1457
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001458 const int z = get_global_id(2);
1459
Gian Marco7b4d5472018-01-10 15:56:30 +00001460 // Compute destination address
1461 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1462
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001463#if defined(REINTERPRET_OUTPUT_AS_3D)
1464 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
1465 // in order to take into account the presence of possible cross plane paddings
1466 //
1467 // | |
1468 // | plane0 |
1469 // | |
1470 // |__________________|
1471 // |******************|
1472 // | cross_plane_pad |
1473 // |******************|
1474 // | |
1475 // | plane1 |
1476 // | |
1477 // |__________________|
1478
1479 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
1480 uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
1481 zout = min(DEPTH_GEMM3D - 1, zout);
1482
1483 // Add offset due to the cross plane paddings
1484 zout *= (dst_cross_plane_pad * dst_stride_y);
1485
1486 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1487 // multiply dst_stride_z by DEPTH_GEMM3D
1488 dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
1489
Gian Marco7b4d5472018-01-10 15:56:30 +00001490 // Store the result
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001491 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
Gian Marco7b4d5472018-01-10 15:56:30 +00001492#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001493 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
Gian Marco7b4d5472018-01-10 15:56:30 +00001494#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1495#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001496 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
Gian Marco7b4d5472018-01-10 15:56:30 +00001497#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1498#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001499 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
Gian Marco7b4d5472018-01-10 15:56:30 +00001500#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1501#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001502 vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
Gian Marco7b4d5472018-01-10 15:56:30 +00001503#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001504
1505#else // defined(REINTERPRET_OUTPUT_AS_3D)
1506 // Add offset for batched GEMM
1507 dst.ptr += z * dst_stride_z;
1508
1509 // Store the result
1510 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
1511#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1512 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
1513#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1514#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1515 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
1516#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1517#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1518 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
1519#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1520#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1521 vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
1522#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1523#endif // defined(REINTERPRET_OUTPUT_AS_3D)
Gian Marco05288a22017-11-21 10:57:50 +00001524}
Giorgio Arena6200fa42018-07-06 17:06:36 +01001525
Georgios Pinitasdaa38552018-08-28 17:43:18 +01001526#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001527/** OpenCL kernel optimized to use dot product that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
1528 *
1529 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
1530 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001531 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
1532 * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
1533 * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
1534 * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
1535 * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
1536 * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
1537 *
Giorgio Arena6200fa42018-07-06 17:06:36 +01001538 * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
1539 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
1540 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1541 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
1542 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1543 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
1544 * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
1545 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
1546 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1547 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
1548 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1549 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
1550 * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
1551 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
1552 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1553 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
1554 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1555 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001556 * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
1557 * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
1558 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
1559 * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
1560 * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001561 */
1562__kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),
1563 IMAGE_DECLARATION(src1),
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001564 IMAGE_DECLARATION(dst),
1565 uint src0_stride_z,
1566 uint src1_stride_z,
1567 uint dst_stride_z
1568#if defined(REINTERPRET_INPUT_AS_3D)
1569 ,
1570 uint src_cross_plane_pad
1571#endif // REINTERPRET_INPUT_AS_3D
1572#if defined(REINTERPRET_OUTPUT_AS_3D)
1573 ,
1574 uint dst_cross_plane_pad
1575#endif // REINTERPRET_OUTPUT_AS_3D)
1576 )
Giorgio Arena6200fa42018-07-06 17:06:36 +01001577{
1578 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
1579
1580 // Compute starting address for matrix A and Matrix B
1581 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1582
1583 // Update address for the matrix A
1584 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
1585
1586 // Update address for the matrix B
1587 src_addr.s1 += idx;
1588
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001589#if defined(REINTERPRET_INPUT_AS_3D)
1590 // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
1591 // in order to take into account the presence of possible cross plane paddings
1592 //
1593 // | |
1594 // | plane0 |
1595 // | |
1596 // |__________________|
1597 // |******************|
1598 // | cross_plane_pad |
1599 // |******************|
1600 // | |
1601 // | plane1 |
1602 // | |
1603 // |__________________|
1604
1605 // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
1606 uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
1607 zin = min(DEPTH_GEMM3D - 1, zin);
1608
1609 // Add offset due to the cross plane paddings
1610 zin *= (src_cross_plane_pad * src0_stride_y);
1611
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001612 zin += ((uint4)(0, 1, 2, 3)) * src0_stride_y;
1613
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001614 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1615 // multiply src0_stride_z by DEPTH_GEMM3D
1616 src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
1617
1618#else // defined(REINTERPRET_INPUT_AS_3D)
1619
1620 // Add offset for batched GEMM
1621 src_addr.s0 += get_global_id(2) * src0_stride_z;
1622
1623#endif // defined(REINTERPRET_INPUT_AS_3D)
1624
1625#if defined(MATRIX_B_DEPTH)
1626 // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
1627 src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
1628#else // defined(MATRIX_B_DEPTH)
1629 src_addr.s1 += get_global_id(2) * src1_stride_z;
1630#endif // defined(MATRIX_B_DEPTH)
1631
Giorgio Arena6200fa42018-07-06 17:06:36 +01001632 uint acc00 = 0;
1633 uint acc01 = 0;
1634 uint acc02 = 0;
1635 uint acc03 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001636 uint acc04 = 0;
1637 uint acc05 = 0;
1638 uint acc06 = 0;
1639 uint acc07 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001640#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1641 uint acc10 = 0;
1642 uint acc11 = 0;
1643 uint acc12 = 0;
1644 uint acc13 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001645 uint acc14 = 0;
1646 uint acc15 = 0;
1647 uint acc16 = 0;
1648 uint acc17 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001649#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1650#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1651 uint acc20 = 0;
1652 uint acc21 = 0;
1653 uint acc22 = 0;
1654 uint acc23 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001655 uint acc24 = 0;
1656 uint acc25 = 0;
1657 uint acc26 = 0;
1658 uint acc27 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001659#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1660#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1661 uint acc30 = 0;
1662 uint acc31 = 0;
1663 uint acc32 = 0;
1664 uint acc33 = 0;
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001665 uint acc34 = 0;
1666 uint acc35 = 0;
1667 uint acc36 = 0;
1668 uint acc37 = 0;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001669#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Giorgio Arena6200fa42018-07-06 17:06:36 +01001670
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001671 // A and B src indices get incremented at the same time.
1672 int i = 0;
1673 for(; i <= ((int)COLS_A - 8); i += 8)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001674 {
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001675#if defined(REINTERPRET_INPUT_AS_3D)
1676 // Load values from matrix A and matrix B
1677 uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001678#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001679 uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001680#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1681#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001682 uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001683#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1684#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001685 uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001686#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001687#else // defined(REINTERPRET_INPUT_AS_3D)
1688 // Load values from matrix A and matrix B
1689 uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1690#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1691 uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1692#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1693#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1694 uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1695#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1696#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1697 uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1698#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1699#endif // defined(REINTERPRET_INPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001700
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001701 uchar8 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
1702 uchar8 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
1703 uchar8 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
1704 uchar8 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
1705 src_addr.s1 += 4 * src1_stride_y;
1706
1707 ARM_DOT(a0.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);
1708 ARM_DOT(a0.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);
1709 ARM_DOT(a0.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);
1710 ARM_DOT(a0.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);
1711 ARM_DOT(a0.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);
1712 ARM_DOT(a0.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);
1713 ARM_DOT(a0.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);
1714 ARM_DOT(a0.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);
1715
Giorgio Arena6200fa42018-07-06 17:06:36 +01001716#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001717 ARM_DOT(a1.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);
1718 ARM_DOT(a1.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);
1719 ARM_DOT(a1.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);
1720 ARM_DOT(a1.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);
1721 ARM_DOT(a1.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);
1722 ARM_DOT(a1.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);
1723 ARM_DOT(a1.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);
1724 ARM_DOT(a1.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);
Giorgio Arena6200fa42018-07-06 17:06:36 +01001725#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1726#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001727 ARM_DOT(a2.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);
1728 ARM_DOT(a2.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);
1729 ARM_DOT(a2.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);
1730 ARM_DOT(a2.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);
1731 ARM_DOT(a2.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);
1732 ARM_DOT(a2.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);
1733 ARM_DOT(a2.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);
1734 ARM_DOT(a2.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);
Giorgio Arena6200fa42018-07-06 17:06:36 +01001735#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1736#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001737 ARM_DOT(a3.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);
1738 ARM_DOT(a3.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);
1739 ARM_DOT(a3.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);
1740 ARM_DOT(a3.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);
1741 ARM_DOT(a3.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);
1742 ARM_DOT(a3.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);
1743 ARM_DOT(a3.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);
1744 ARM_DOT(a3.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);
Giorgio Arena6200fa42018-07-06 17:06:36 +01001745#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001746
1747 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
1748 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
1749 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
1750 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
1751 src_addr.s1 += 4 * src1_stride_y;
1752
1753 ARM_DOT(a0.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);
1754 ARM_DOT(a0.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);
1755 ARM_DOT(a0.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);
1756 ARM_DOT(a0.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);
1757 ARM_DOT(a0.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);
1758 ARM_DOT(a0.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);
1759 ARM_DOT(a0.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);
1760 ARM_DOT(a0.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);
1761
1762#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1763 ARM_DOT(a1.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);
1764 ARM_DOT(a1.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);
1765 ARM_DOT(a1.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);
1766 ARM_DOT(a1.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);
1767 ARM_DOT(a1.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);
1768 ARM_DOT(a1.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);
1769 ARM_DOT(a1.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);
1770 ARM_DOT(a1.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);
1771#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1772#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1773 ARM_DOT(a2.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);
1774 ARM_DOT(a2.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);
1775 ARM_DOT(a2.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);
1776 ARM_DOT(a2.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);
1777 ARM_DOT(a2.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);
1778 ARM_DOT(a2.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);
1779 ARM_DOT(a2.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);
1780 ARM_DOT(a2.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);
1781#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1782#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1783 ARM_DOT(a3.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);
1784 ARM_DOT(a3.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);
1785 ARM_DOT(a3.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);
1786 ARM_DOT(a3.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);
1787 ARM_DOT(a3.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);
1788 ARM_DOT(a3.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);
1789 ARM_DOT(a3.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);
1790 ARM_DOT(a3.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);
1791#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1792
1793 src_addr.s0 += 8;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001794 }
1795
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001796 for(; i < (int)COLS_A; ++i)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001797 {
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001798#if defined(REINTERPRET_INPUT_AS_3D)
Giorgio Arena6200fa42018-07-06 17:06:36 +01001799 // Load values from matrix A
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001800 uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001801#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001802 uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001803#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1804#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001805 uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001806#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1807#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001808 uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001809#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001810#else // defined(REINTERPRET_INPUT_AS_3D)
1811 // Load values from matrix A
1812 uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
1813#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1814 uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
1815#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1816#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1817 uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
1818#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1819#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1820 uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
1821#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1822#endif // defined(REINTERPRET_INPUT_AS_3D)
1823
Giorgio Arena6200fa42018-07-06 17:06:36 +01001824 // Load values from matrix B
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001825 uchar8 b0 = vload8(0, src1_ptr + src_addr.s1);
1826 src_addr.s1 += src1_stride_y;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001827
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001828 acc00 += (uint)a0 * b0.s0;
1829 acc01 += (uint)a0 * b0.s1;
1830 acc02 += (uint)a0 * b0.s2;
1831 acc03 += (uint)a0 * b0.s3;
1832 acc04 += (uint)a0 * b0.s4;
1833 acc05 += (uint)a0 * b0.s5;
1834 acc06 += (uint)a0 * b0.s6;
1835 acc07 += (uint)a0 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001836
Giorgio Arena6200fa42018-07-06 17:06:36 +01001837#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001838 acc10 += (uint)a1 * b0.s0;
1839 acc11 += (uint)a1 * b0.s1;
1840 acc12 += (uint)a1 * b0.s2;
1841 acc13 += (uint)a1 * b0.s3;
1842 acc14 += (uint)a1 * b0.s4;
1843 acc15 += (uint)a1 * b0.s5;
1844 acc16 += (uint)a1 * b0.s6;
1845 acc17 += (uint)a1 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001846#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1847#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001848 acc20 += (uint)a2 * b0.s0;
1849 acc21 += (uint)a2 * b0.s1;
1850 acc22 += (uint)a2 * b0.s2;
1851 acc23 += (uint)a2 * b0.s3;
1852 acc24 += (uint)a2 * b0.s4;
1853 acc25 += (uint)a2 * b0.s5;
1854 acc26 += (uint)a2 * b0.s6;
1855 acc27 += (uint)a2 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001856#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1857#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001858 acc30 += (uint)a3 * b0.s0;
1859 acc31 += (uint)a3 * b0.s1;
1860 acc32 += (uint)a3 * b0.s2;
1861 acc33 += (uint)a3 * b0.s3;
1862 acc34 += (uint)a3 * b0.s4;
1863 acc35 += (uint)a3 * b0.s5;
1864 acc36 += (uint)a3 * b0.s6;
1865 acc37 += (uint)a3 * b0.s7;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001866#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Giorgio Arena6200fa42018-07-06 17:06:36 +01001867
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001868 src_addr.s0 += 1;
Giorgio Arena6200fa42018-07-06 17:06:36 +01001869 }
1870
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001871 int z = get_global_id(2);
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001872
Giorgio Arena6200fa42018-07-06 17:06:36 +01001873 // Compute destination address
1874 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1875
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001876 // Compute dst address
1877 __global uchar *dst_addr = dst.ptr;
1878
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001879#if defined(REINTERPRET_OUTPUT_AS_3D)
1880 // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
1881 // in order to take into account the presence of possible cross plane paddings
1882 //
1883 // | |
1884 // | plane0 |
1885 // | |
1886 // |__________________|
1887 // |******************|
1888 // | cross_plane_pad |
1889 // |******************|
1890 // | |
1891 // | plane1 |
1892 // | |
1893 // |__________________|
1894
1895 // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001896 uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001897 zout = min(DEPTH_GEMM3D - 1, zout);
1898
1899 // Add offset due to the cross plane paddings
1900 zout *= (dst_cross_plane_pad * dst_stride_y);
1901
1902 // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
1903 // multiply dst_stride_z by DEPTH_GEMM3D
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001904 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001905
Giorgio Arena6200fa42018-07-06 17:06:36 +01001906 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001907 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));
1908 vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001909#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001910 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));
1911 vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001912#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1913#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001914 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));
1915 vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001916#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1917#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001918 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));
1919 vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));
Giorgio Arena6200fa42018-07-06 17:06:36 +01001920#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001921
1922#else // defined(REINTERPRET_OUTPUT_AS_3D)
1923 // Add offset for batched GEMM
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001924 dst_addr += z * dst_stride_z;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001925
1926 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001927 vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y));
1928 vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001929#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001930 vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y));
1931 vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001932#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1933#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001934 vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y));
1935 vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001936#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1937#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001938 vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y));
1939 vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y));
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01001940#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
Gian Marco Iodice4b908652018-10-18 10:21:02 +01001941#endif // defined(REINTERPRET_OUTPUT_AS_3D)
1942}
Georgios Pinitasdaa38552018-08-28 17:43:18 +01001943#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Gian Marco05288a22017-11-21 10:57:50 +00001944#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
1945
1946#if defined(COLS_A)
1947/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
1948 *
1949 * @note This stage is needed to handle the offset of matrix product
1950 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
1951 *
1952 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
1953 *
1954 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
1955 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1956 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1957 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1958 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1959 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1960 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1961 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1962 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
1963 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1964 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1965 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1966 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1967 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1968 */
1969__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
1970 IMAGE_DECLARATION(dst))
1971{
1972 // Compute source and destination addresses
1973 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
1974 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
1975
1976 uint4 sum_row_u32 = (uint4)0;
1977 uint sum_row = 0;
1978
1979 __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
1980
1981 int i = 0;
1982
1983 // This for loop performs 16 accumulations
1984 for(; i <= ((int)COLS_A - 16); i += 16)
1985 {
1986 const uchar16 a0_u8 = vload16(0, matrix_a + i);
1987
1988 sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
1989 }
1990
1991 // This for loop performs the leftover accumulations
1992 for(; i < COLS_A; ++i)
1993 {
1994 sum_row += matrix_a[i];
1995 }
1996
1997 sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;
1998
1999 *((__global int *)dst.ptr) = (int)sum_row;
2000}
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002001
2002#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
2003/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction
2004 *
2005 * @note This stage is needed to handle the offset of matrix product
2006 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
2007 *
2008 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
2009 *
2010 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
2011 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2012 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2013 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2014 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2015 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2016 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2017 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2018 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
2019 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2020 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2021 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2022 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2023 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2024 */
2025__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
2026 IMAGE_DECLARATION(dst))
2027{
2028 // Compute source and destination addresses
2029 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
2030 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
2031
2032 uint sum_row = 0;
2033
2034 __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
2035
2036 int i = 0;
2037
2038 // This for loop performs 16 accumulations
2039 for(; i <= ((int)COLS_A - 32); i += 32)
2040 {
2041 uchar16 a0_u8 = vload16(0, matrix_a + i);
2042
2043 sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
2044 sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
2045 sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
2046 sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
2047
2048 a0_u8 = vload16(1, matrix_a + i);
2049
2050 sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
2051 sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
2052 sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
2053 sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
2054 }
2055
2056 // This for loop performs the leftover accumulations
2057 for(; i < COLS_A; ++i)
2058 {
2059 sum_row += matrix_a[i];
2060 }
2061
2062 *((__global int *)dst.ptr) = (int)sum_row;
2063}
2064#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Gian Marco05288a22017-11-21 10:57:50 +00002065#endif // defined(COLS_A)
2066
2067#if defined(COLS_B) && defined(ROWS_B)
2068/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
2069 *
2070 * @note This stage is needed to handle the offset of matrix product
2071 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
2072 *
2073 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
2074 *
2075 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
2076 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2077 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2078 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2079 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2080 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2081 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2082 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2083 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32
2084 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2085 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2086 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2087 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2088 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2089 */
2090__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
2091 IMAGE_DECLARATION(dst))
2092{
2093 // Compute source and destination addresses
2094 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
2095 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
2096
2097 uint16 sum_col_u32 = (uint16)0;
2098
2099 __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);
2100
2101 int i = 0;
2102 // This for loop performs 4 accumulations
2103 for(; i <= ((int)ROWS_B - 4); i += 4)
2104 {
2105 const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
2106 const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
2107 const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
2108 const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);
2109
2110 sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);
2111
2112 matrix_b += 4 * src_stride_y;
2113 }
2114
2115 // This for loop perfoms the leftover accumulations
2116 for(; i < (int)ROWS_B; ++i)
2117 {
2118 const uchar16 b0_u8 = vload16(0, matrix_b);
2119
2120 sum_col_u32 += convert_uint16(b0_u8);
2121
2122 matrix_b += src_stride_y;
2123 }
2124
2125 vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
2126}
2127#endif // defined(COLS_B) && defined(ROWS_B)
2128
2129#if defined(K_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002130
2131/* Helper function used to calculate the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel.
2132 *
2133 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),
2134 * and calculates the offset contribution of matrix A and matrix B.
2135 *
2136 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2137 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2138 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
2139 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
2140 *
2141 * @param[in] x get_global_id(0) * 4
2142 * @param[in] y get_global_id(1)
2143 * @param[in] z get_global_id(2)
2144 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2145 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2146 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2147 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2148 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2149 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2150 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2151 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2152 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2153 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2154 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2155 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2156 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2157 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2158 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2159 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
2160 */
2161inline int4 offset_contribution(
2162 int x,
2163 int y,
2164 int z
2165#if defined(A_OFFSET)
2166 ,
2167 IMAGE_DECLARATION(sum_col)
2168#endif // defined(A_OFFSET)
2169#if defined(B_OFFSET)
2170 ,
2171 IMAGE_DECLARATION(sum_row)
2172#endif // defined(B_OFFSET)
2173#if defined(ADD_BIAS)
2174 ,
2175 VECTOR_DECLARATION(biases)
2176#endif // defined(ADD_BIAS)
2177)
2178{
2179 int4 a_offset_s32 = (int4)0;
2180 int4 b_offset_s32 = (int4)0;
2181
2182 int batch_id = z;
2183#if defined(DEPTH_INPUT3D)
2184 batch_id /= (int)DEPTH_INPUT3D;
2185#endif // defined(DEPTH_INPUT3D)
2186
2187#if defined(A_OFFSET)
2188 // Compute the offset contribution due to A_OFFSET
2189 __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
2190
2191 // Compute the offset contribution due to A_OFFSET
2192#if defined(SUM_COL_HAS_BATCHES)
2193 a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
2194#else // defined(SUM_COL_HAS_BATCHES)
2195 a_offset_s32 = vload4(0, (__global int *)sum_col_addr);
2196#endif // defined(SUM_COL_HAS_BATCHES)
2197
2198 a_offset_s32 *= (int4)A_OFFSET;
2199#endif // defined(A_OFFSET)
2200
2201#if defined(B_OFFSET)
2202 // Compute the offset contribution due to A_OFFSET
2203 __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
2204
2205 // Compute the offset contribution due to B_OFFSET
2206#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
2207 b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
2208#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
2209 b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
2210#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
2211 b_offset_s32 *= (int4)B_OFFSET;
2212#endif // defined(B_OFFSET)
2213
2214#if defined(ADD_BIAS)
2215 // Add bias
2216 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2217
2218 int4 biases_values = vload4(0, (__global int *)bias_addr);
2219 b_offset_s32 += (int4)biases_values;
2220#endif // defined(ADD_BIAS)
2221
2222 return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
2223}
2224
Gian Marco05288a22017-11-21 10:57:50 +00002225/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
2226 *
2227 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),
2228 * and adds to it the offset contribution of matrix A and matrix B in-place.
2229 *
2230 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2231 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2232 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
Chunosov5124be52017-11-22 20:42:13 +07002233 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
Gian Marco05288a22017-11-21 10:57:50 +00002234 *
2235 * The final result is:
2236 *
2237 * mm_result[i][k] = mm_result[i][k] +
2238 * (sum_col[k] * A_OFFSET) +
2239 * (sum_row[i] * B_OFFSET) +
2240 * (K_OFFSET)
2241 *
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01002242 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
2243 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
2244 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
2245 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
2246 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
2247 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
2248 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
2249 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002250 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2251 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2252 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2253 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2254 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2255 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2256 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2257 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2258 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2259 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2260 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2261 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2262 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2263 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2264 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2265 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco05288a22017-11-21 10:57:50 +00002266 */
2267__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
2268#if defined(A_OFFSET)
2269 ,
2270 IMAGE_DECLARATION(sum_col)
2271#endif // defined(A_OFFSET)
2272#if defined(B_OFFSET)
2273 ,
2274 IMAGE_DECLARATION(sum_row)
2275#endif // defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002276#if defined(ADD_BIAS)
2277 ,
2278 VECTOR_DECLARATION(biases)
2279#endif // defined(ADD_BIAS))
Gian Marco05288a22017-11-21 10:57:50 +00002280 )
2281{
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002282 const int x = get_global_id(0) * 4;
Georgios Pinitasebf6b8a2018-09-24 16:31:08 +01002283 const int y = get_global_id(1);
2284 const int z = get_global_id(2);
2285
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002286 // Compute offset contribution
2287 int4 offset_term_s32 = offset_contribution(
2288 x, y, z
Gian Marco05288a22017-11-21 10:57:50 +00002289#if defined(A_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002290 ,
2291 sum_col_ptr,
2292 sum_col_stride_x,
2293 sum_col_step_x,
2294 sum_col_stride_y,
2295 sum_col_step_y,
2296 sum_col_offset_first_element_in_bytes
Gian Marco05288a22017-11-21 10:57:50 +00002297#endif // defined(A_OFFSET)
Gian Marco05288a22017-11-21 10:57:50 +00002298#if defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002299 ,
2300 sum_row_ptr,
2301 sum_row_stride_x,
2302 sum_row_step_x,
2303 sum_row_stride_y,
2304 sum_row_step_y,
2305 sum_row_offset_first_element_in_bytes
Gian Marco05288a22017-11-21 10:57:50 +00002306#endif // defined(B_OFFSET)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002307#if defined(ADD_BIAS)
2308 ,
2309 biases_ptr,
2310 biases_stride_x,
2311 biases_step_x,
2312 biases_offset_first_element_in_bytes
2313#endif // defined(ADD_BIAS)
2314 );
Gian Marco05288a22017-11-21 10:57:50 +00002315
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002316 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
Gian Marco05288a22017-11-21 10:57:50 +00002317
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002318 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
Gian Marco05288a22017-11-21 10:57:50 +00002319
2320 // Add the offset terms to GEMM's result
2321 in_s32 += offset_term_s32;
2322
2323 // Store the result with the offset contribution
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002324 vstore4(in_s32, 0, (__global int *)mm_result_addr);
Gian Marco05288a22017-11-21 10:57:50 +00002325}
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002326
2327#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
2328/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.
2329 *
2330 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
2331 *
2332 *
2333 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2334 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2335 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
2336 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
2337 *
2338 * The result before the output stage is:
2339 *
2340 * mm_result[i][k] = mm_result[i][k] +
2341 * (sum_col[k] * A_OFFSET) +
2342 * (sum_row[i] * B_OFFSET) +
2343 * (K_OFFSET)
2344 *
2345 * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
2346 *
2347 * -# Add offset terms to final result
2348 * -# Multiply each entry of result by result_mult_int
2349 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
2350 * -# Shift the int32 accumulator by result_shift
2351 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
2352 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2353 *
2354 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
2355 *
2356 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2357 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2358 * These values can be used to implement "rectified linear unit" activation functions
2359 *
2360 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
2361 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
2362 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
2363 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
2364 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
2365 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
2366 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
2367 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
2368 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2369 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2370 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2371 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2372 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2373 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2374 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2375 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2376 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2377 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2378 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2379 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2380 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2381 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2382 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2383 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
2384 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2385 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2386 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2387 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2388 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2389 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2390 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2391 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2392 */
2393__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
2394#if defined(A_OFFSET)
2395 ,
2396 IMAGE_DECLARATION(sum_col)
2397#endif // defined(A_OFFSET)
2398#if defined(B_OFFSET)
2399 ,
2400 IMAGE_DECLARATION(sum_row)
2401#endif // defined(B_OFFSET)
2402 ,
2403#if defined(ADD_BIAS)
2404 VECTOR_DECLARATION(biases),
2405#endif // defined(ADD_BIAS)
2406 TENSOR3D_DECLARATION(dst))
2407{
2408 const int x = get_global_id(0) * 4;
2409 const int y = get_global_id(1);
2410 const int z = get_global_id(2);
2411
2412 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2413
2414 // Compute offset contribution
2415 int4 offset_term_s32 = offset_contribution(
2416 x, y, z
2417#if defined(A_OFFSET)
2418 ,
2419 sum_col_ptr,
2420 sum_col_stride_x,
2421 sum_col_step_x,
2422 sum_col_stride_y,
2423 sum_col_step_y,
2424 sum_col_offset_first_element_in_bytes
2425#endif // defined(A_OFFSET)
2426#if defined(B_OFFSET)
2427 ,
2428 sum_row_ptr,
2429 sum_row_stride_x,
2430 sum_row_step_x,
2431 sum_row_stride_y,
2432 sum_row_step_y,
2433 sum_row_offset_first_element_in_bytes
2434#endif // defined(B_OFFSET)
2435#if defined(ADD_BIAS)
2436 ,
2437 biases_ptr,
2438 biases_stride_x,
2439 biases_step_x,
2440 biases_offset_first_element_in_bytes
2441#endif // defined(ADD_BIAS)
2442 );
2443
2444 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
2445
2446 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
2447
2448 // Add the offset terms to GEMM's result
2449 in_s32 += offset_term_s32;
2450
2451 // -------------- OUTPUT STAGE
2452
2453 // Add the offset terms to GEMM's result
2454 in_s32 += (int4)RESULT_OFFSET;
2455
2456 // Multiply by result_mult_int and shift
2457 in_s32 *= RESULT_MULTIPLIER;
2458
2459 in_s32 >>= RESULT_SHIFT;
2460
2461 uchar4 res = convert_uchar4_sat(in_s32);
2462
2463#if defined(MIN_BOUND)
2464 res = max(res, (uchar4)MIN_BOUND);
2465#endif // defined(MIN_BOUND)
2466#if defined(MAX_BOUND)
2467 res = min(res, (uchar4)MAX_BOUND);
2468#endif // defined(MAX_BOUND)
2469
2470 // Store the result
2471 vstore4(res, 0, dst_addr);
2472}
2473
2474/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.
2475 *
2476 * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
2477 *
2478 *
2479 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
2480 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
2481 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
2482 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
2483 *
2484 * The result before the output stage is:
2485 *
2486 * mm_result[i][k] = mm_result[i][k] +
2487 * (sum_col[k] * A_OFFSET) +
2488 * (sum_row[i] * B_OFFSET) +
2489 * (K_OFFSET)
2490 *
2491 * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
2492 *
2493 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
2494 * -# Add bias to final result if bias tensor is not a nullptr
2495 * -# Round to nearest division by a power-of-two using result_shift
2496 * -# Add offset to each result
2497 * -# Clamp the value between the specified min and max bounds
2498 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2499 *
2500 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
2501 *
2502 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2503 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2504 * These values can be used to implement "rectified linear unit" activation functions
2505 *
2506 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
2507 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
2508 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
2509 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
2510 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
2511 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
2512 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
2513 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
2514 * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2515 * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2516 * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
2517 * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2518 * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
2519 * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2520 * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
2521 * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)
2522 * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
2523 * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)
2524 * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
2525 * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
2526 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2527 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2528 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2529 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
2530 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2531 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2532 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2533 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2534 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2535 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2536 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2537 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2538 */
2539__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
2540#if defined(A_OFFSET)
2541 ,
2542 IMAGE_DECLARATION(sum_col)
2543#endif // defined(A_OFFSET)
2544#if defined(B_OFFSET)
2545 ,
2546 IMAGE_DECLARATION(sum_row)
2547#endif // defined(B_OFFSET)
2548 ,
2549#if defined(ADD_BIAS)
2550 VECTOR_DECLARATION(biases),
2551#endif // defined(ADD_BIAS)
2552 TENSOR3D_DECLARATION(dst))
2553{
2554 const int x = get_global_id(0) * 4;
2555 const int y = get_global_id(1);
2556 const int z = get_global_id(2);
2557
2558 // Compute offset contribution
2559 int4 offset_term_s32 = offset_contribution(
2560 x, y, z
2561#if defined(A_OFFSET)
2562 ,
2563 sum_col_ptr,
2564 sum_col_stride_x,
2565 sum_col_step_x,
2566 sum_col_stride_y,
2567 sum_col_step_y,
2568 sum_col_offset_first_element_in_bytes
2569#endif // defined(A_OFFSET)
2570#if defined(B_OFFSET)
2571 ,
2572 sum_row_ptr,
2573 sum_row_stride_x,
2574 sum_row_step_x,
2575 sum_row_stride_y,
2576 sum_row_step_y,
2577 sum_row_offset_first_element_in_bytes
2578#endif // defined(B_OFFSET)
2579#if defined(ADD_BIAS)
2580 ,
2581 biases_ptr,
2582 biases_stride_x,
2583 biases_step_x,
2584 biases_offset_first_element_in_bytes
2585#endif // defined(ADD_BIAS)
2586 );
2587
2588 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
2589
2590 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2591
2592 int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
2593
2594 // Add the offset terms to GEMM's result
2595 in_s32 += offset_term_s32;
2596
2597 // -------------- OUTPUT STAGE
2598
2599 // Multiply by result_mult_int and shift
2600 in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
2601
2602 // Add the offset terms to GEMM's result
2603 in_s32 += (int4)RESULT_OFFSET;
2604
2605 uchar4 res = convert_uchar4_sat(in_s32);
2606
2607#if defined(MIN_BOUND)
2608 res = max(res, (uchar4)MIN_BOUND);
2609#endif // defined(MIN_BOUND)
2610#if defined(MAX_BOUND)
2611 res = min(res, (uchar4)MAX_BOUND);
2612#endif // defined(MAX_BOUND)
2613
2614 // Store the result
2615 vstore4(res, 0, dst_addr);
2616}
2617#endif // defined(K_OFFSET) && defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
Gian Marco05288a22017-11-21 10:57:50 +00002618#endif // defined(K_OFFSET)
2619
2620#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
2621/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
2622 *
2623 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
2624 * The following computations will be performed by the kernel:
2625 *
2626 * -# Add offset terms to final result
2627 * -# Multiply each entry of result by result_mult_int
2628 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
2629 * -# Shift the int32 accumulator by result_shift
2630 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
2631 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2632 *
2633 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
2634 *
2635 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2636 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2637 * These values can be used to implement "rectified linear unit" activation functions
2638 *
2639 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
2640 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2641 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2642 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2643 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2644 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2645 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2646 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002647 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2648 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2649 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2650 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco05288a22017-11-21 10:57:50 +00002651 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2652 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2653 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2654 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2655 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2656 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2657 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2658 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2659 */
2660__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
2661#if defined(ADD_BIAS)
2662 VECTOR_DECLARATION(biases),
2663#endif // defined(ADD_BIAS)
2664 TENSOR3D_DECLARATION(dst))
2665{
2666 // Compute source and destination addresses
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002667 int x = get_global_id(0) * 4;
2668 int y = get_global_id(1);
2669 int z = get_global_id(2);
Gian Marco05288a22017-11-21 10:57:50 +00002670
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002671 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Gian Marco05288a22017-11-21 10:57:50 +00002672
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002673 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2674
2675 int4 input_values = vload4(0, (__global int *)src_addr);
Gian Marco58c57942017-11-28 09:10:03 +00002676
Gian Marco05288a22017-11-21 10:57:50 +00002677#if defined(ADD_BIAS)
2678 // Add bias
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002679 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2680
2681 int4 biases_values = vload4(0, (__global int *)bias_addr);
2682 input_values += (int4)biases_values;
Gian Marco05288a22017-11-21 10:57:50 +00002683#endif // defined(ADD_BIAS)
2684
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002685 // Add the offset terms to GEMM's result
2686 input_values += (int4)RESULT_OFFSET;
2687
Georgios Pinitas45bcc3a2017-11-29 11:06:49 +00002688 // Multiply by result_mult_int and shift
Gian Marco58c57942017-11-28 09:10:03 +00002689 input_values *= RESULT_MULT_INT;
Gian Marco05288a22017-11-21 10:57:50 +00002690
Gian Marco58c57942017-11-28 09:10:03 +00002691 input_values >>= RESULT_SHIFT;
Gian Marco05288a22017-11-21 10:57:50 +00002692
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002693 uchar4 res = convert_uchar4_sat(input_values);
Gian Marco05288a22017-11-21 10:57:50 +00002694
2695#if defined(MIN_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002696 res = max(res, (uchar4)MIN_BOUND);
Gian Marco05288a22017-11-21 10:57:50 +00002697#endif // defined(MIN_BOUND)
2698#if defined(MAX_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002699 res = min(res, (uchar4)MAX_BOUND);
Gian Marco05288a22017-11-21 10:57:50 +00002700#endif // defined(MAX_BOUND)
2701
2702 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002703 vstore4(res, 0, dst_addr);
Gian Marco05288a22017-11-21 10:57:50 +00002704}
Gian Marco58c57942017-11-28 09:10:03 +00002705#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
2706
2707#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
2708/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
2709 *
2710 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
2711 * The following computations will be performed by the kernel:
2712 *
2713 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
2714 * -# Add bias to final result if bias tensor is not a nullptr
2715 * -# Round to nearest division by a power-of-two using result_shift
2716 * -# Add offset to each result
2717 * -# Clamp the value between the specified min and max bounds
2718 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2719 *
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002720 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
Gian Marco58c57942017-11-28 09:10:03 +00002721 *
2722 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2723 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2724 * These values can be used to implement "rectified linear unit" activation functions
2725 *
2726 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
2727 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2728 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2729 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2730 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2731 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2732 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2733 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002734 * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
2735 * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)
2736 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
2737 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
Gian Marco58c57942017-11-28 09:10:03 +00002738 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2739 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2740 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2741 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2742 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2743 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2744 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2745 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2746 */
2747__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
2748#if defined(ADD_BIAS)
2749 VECTOR_DECLARATION(biases),
2750#endif // defined(ADD_BIAS)
2751 TENSOR3D_DECLARATION(dst))
2752{
2753 // Compute source and destination addresses
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002754 int x = get_global_id(0) * 4;
2755 int y = get_global_id(1);
2756 int z = get_global_id(2);
Georgios Pinitas932491f2018-09-21 16:33:15 +01002757
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002758 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Gian Marco58c57942017-11-28 09:10:03 +00002759
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002760 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2761
2762 int4 input_values = vload4(0, (__global int *)src_addr);
Gian Marco58c57942017-11-28 09:10:03 +00002763
2764#if defined(ADD_BIAS)
2765 // Add bias
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002766 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2767
2768 int4 biases_values = vload4(0, (__global int *)bias_addr);
2769 input_values += (int4)biases_values;
Gian Marco58c57942017-11-28 09:10:03 +00002770#endif // defined(ADD_BIAS)
2771
2772 // Multiply by result_mult_int and shift
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002773 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
Gian Marco58c57942017-11-28 09:10:03 +00002774
2775 // Add the offset terms to GEMM's result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002776 input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
Gian Marco58c57942017-11-28 09:10:03 +00002777
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002778 uchar4 res = convert_uchar4_sat(input_values);
Gian Marco58c57942017-11-28 09:10:03 +00002779
2780#if defined(MIN_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002781 res = max(res, (uchar4)MIN_BOUND);
Gian Marco58c57942017-11-28 09:10:03 +00002782#endif // defined(MIN_BOUND)
2783#if defined(MAX_BOUND)
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002784 res = min(res, (uchar4)MAX_BOUND);
Gian Marco58c57942017-11-28 09:10:03 +00002785#endif // defined(MAX_BOUND)
2786
2787 // Store the result
Gian Marco Iodice4b908652018-10-18 10:21:02 +01002788 vstore4(res, 0, dst_addr);
Gian Marco58c57942017-11-28 09:10:03 +00002789}
Chunosov5124be52017-11-22 20:42:13 +07002790#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002791
2792#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
2793/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
2794 *
2795 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
2796 * The following computations will be performed by the kernel:
2797 *
2798 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
2799 * -# Add bias to final result if bias tensor is not a nullptr
2800 * -# Requantize
2801 * -# Add offset to each result
2802 * -# Clamp the value between the specified min and max bounds
2803 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
2804 *
2805 * @attention The offset and scalar scale factor must be passed at compile time using -DRESULT_OFFSET, -DREAL_MULTIPLIER
2806 *
2807 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
2808 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
2809 * These values can be used to implement "rectified linear unit" activation functions
2810 *
2811 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
2812 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
2813 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
2814 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
2815 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
2816 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
2817 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2818 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
2819 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
2820 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
2821 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
2822 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
2823 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
2824 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
2825 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
2826 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
2827 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
2828 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
2829 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
2830 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
2831 * @param[in] dst_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
2832 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
2833 */
2834__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
2835#if defined(ADD_BIAS)
2836 VECTOR_DECLARATION(biases),
2837#endif // defined(ADD_BIAS)
2838#if defined(DST_HEIGHT)
2839 TENSOR4D_DECLARATION(dst))
2840#else // defined(DST_HEIGHT)
2841 TENSOR3D_DECLARATION(dst))
2842#endif // defined(DST_HEIGHT)
2843{
2844 // Compute source and destination addresses
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002845 int x = get_global_id(0) * 4;
2846 int y = get_global_id(1);
2847 int z = get_global_id(2);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002848
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002849 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002850
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002851 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2852
2853 int4 input_values = vload4(0, (__global int *)src_addr);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002854
2855#if defined(ADD_BIAS)
2856 // Add bias
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002857 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
2858
2859 int4 biases_values = vload4(0, (__global int *)bias_addr);
2860 input_values += (int4)biases_values;
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002861#endif // defined(ADD_BIAS)
2862
2863 // Convert to float
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002864 float16 input_values_f = convert_float4(input_values);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002865 input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
2866
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002867 uchar4 res = convert_uchar4_sat(input_values_f);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002868
2869#if defined(MIN_BOUND)
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002870 res = max(res, (uchar4)MIN_BOUND);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002871#endif // defined(MIN_BOUND)
2872#if defined(MAX_BOUND)
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002873 res = min(res, (uchar4)MAX_BOUND);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002874#endif // defined(MAX_BOUND)
2875
2876 // Store the result
Gian Marco Iodice0c54a622018-10-30 12:20:03 +00002877 vstore4(res, 0, dst_addr);
Georgios Pinitas51e53a32018-10-22 13:49:08 +01002878}
2879#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)