/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "helpers_asymm.h"

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val);
#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3));
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)

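// ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) accumulates the 4-element dot
// product x0*y0 + x1*y1 + x2*y2 + x3*y3 into val, mapping to the arm_dot /
// arm_dot_acc built-ins of the cl_arm_integer_dot_product extensions when the
// flags above are set at compile time. A minimal portable equivalent (a sketch
// for illustration only, not part of this kernel set) would be:
//
// #define ARM_DOT_PORTABLE(x0, x1, x2, x3, y0, y1, y2, y3, val) \
//     val += (uint)(x0) * (y0) + (uint)(x1) * (y1) + (uint)(x2) * (y2) + (uint)(x3) * (y3);
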
#if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication.
 *
 * @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024
 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (e.g. -DTRANSPOSE1XW_WIDTH_STEP=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B
    __global uchar *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    int4 c00 = 0;
    int4 c10 = 0;
    int4 c20 = 0;
    int4 c30 = 0;

    for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int4 a0 = convert_int4(vload4(0, src_addr_a));
        int4 b0 = convert_int4(vload4(0, src_addr_b));

        c00 += (int4)a0.s0 * b0;
        c10 += (int4)a0.s1 * b0;
        c20 += (int4)a0.s2 * b0;
        c30 += (int4)a0.s3 * b0;

        a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
        b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));

        c00 += (int4)a0.s0 * b0;
        c10 += (int4)a0.s1 * b0;
        c20 += (int4)a0.s2 * b0;
        c30 += (int4)a0.s3 * b0;
    }

    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int4 a0 = convert_int4(vload4(0, src_addr_a));
        int4 b0 = convert_int4(vload4(0, src_addr_b));

        c00 += (int4)a0.s0 * b0;
        c10 += (int4)a0.s1 * b0;
        c20 += (int4)a0.s2 * b0;
        c30 += (int4)a0.s3 * b0;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x4 block
    vstore4(c00, 0, (__global int *)(offset(&dst, 0, 0)));
    vstore4(c10, 0, (__global int *)(offset(&dst, 0, 1)));
    vstore4(c20, 0, (__global int *)(offset(&dst, 0, 2)));
    vstore4(c30, 0, (__global int *)(offset(&dst, 0, 3)));
}
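
// Work-item mapping for the interleaved/transposed kernels above and below (a
// hedged note, derived from the index arithmetic rather than from host code):
// work-item (gx, gy) reads interleaved-A row gy / MULT_INTERLEAVE4X4_HEIGHT and
// transposed-B row gx / TRANSPOSE1XW_WIDTH_STEP, and writes the 4x4 destination
// block whose top-left element is at row 4 * gy, column 4 * gx; an M x N output
// therefore implies a global size of about (N / 4, M / 4), padded by the host.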

/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication.
 *
 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (e.g. -DTRANSPOSE1XW_WIDTH_STEP=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B
    __global uchar *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    uint c00 = 0;
    uint c01 = 0;
    uint c02 = 0;
    uint c03 = 0;
    uint c10 = 0;
    uint c11 = 0;
    uint c12 = 0;
    uint c13 = 0;
    uint c20 = 0;
    uint c21 = 0;
    uint c22 = 0;
    uint c23 = 0;
    uint c30 = 0;
    uint c31 = 0;
    uint c32 = 0;
    uint c33 = 0;

#if MULT_INTERLEAVE4X4_HEIGHT == 1
    for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar16 a0 = vload16(0, src_addr_a);
        uchar4  b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s4 * b0.s0;
        c01 += (ushort)a0.s4 * b0.s1;
        c02 += (ushort)a0.s4 * b0.s2;
        c03 += (ushort)a0.s4 * b0.s3;

        c10 += (ushort)a0.s5 * b0.s0;
        c11 += (ushort)a0.s5 * b0.s1;
        c12 += (ushort)a0.s5 * b0.s2;
        c13 += (ushort)a0.s5 * b0.s3;

        c20 += (ushort)a0.s6 * b0.s0;
        c21 += (ushort)a0.s6 * b0.s1;
        c22 += (ushort)a0.s6 * b0.s2;
        c23 += (ushort)a0.s6 * b0.s3;

        c30 += (ushort)a0.s7 * b0.s0;
        c31 += (ushort)a0.s7 * b0.s1;
        c32 += (ushort)a0.s7 * b0.s2;
        c33 += (ushort)a0.s7 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s8 * b0.s0;
        c01 += (ushort)a0.s8 * b0.s1;
        c02 += (ushort)a0.s8 * b0.s2;
        c03 += (ushort)a0.s8 * b0.s3;

        c10 += (ushort)a0.s9 * b0.s0;
        c11 += (ushort)a0.s9 * b0.s1;
        c12 += (ushort)a0.s9 * b0.s2;
        c13 += (ushort)a0.s9 * b0.s3;

        c20 += (ushort)a0.sA * b0.s0;
        c21 += (ushort)a0.sA * b0.s1;
        c22 += (ushort)a0.sA * b0.s2;
        c23 += (ushort)a0.sA * b0.s3;

        c30 += (ushort)a0.sB * b0.s0;
        c31 += (ushort)a0.sB * b0.s1;
        c32 += (ushort)a0.sB * b0.s2;
        c33 += (ushort)a0.sB * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.sC * b0.s0;
        c01 += (ushort)a0.sC * b0.s1;
        c02 += (ushort)a0.sC * b0.s2;
        c03 += (ushort)a0.sC * b0.s3;

        c10 += (ushort)a0.sD * b0.s0;
        c11 += (ushort)a0.sD * b0.s1;
        c12 += (ushort)a0.sD * b0.s2;
        c13 += (ushort)a0.sD * b0.s3;

        c20 += (ushort)a0.sE * b0.s0;
        c21 += (ushort)a0.sE * b0.s1;
        c22 += (ushort)a0.sE * b0.s2;
        c23 += (ushort)a0.sE * b0.s3;

        c30 += (ushort)a0.sF * b0.s0;
        c31 += (ushort)a0.sF * b0.s1;
        c32 += (ushort)a0.sF * b0.s2;
        c33 += (ushort)a0.sF * b0.s3;

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload16(0, src_addr_a + 16);
        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s4 * b0.s0;
        c01 += (ushort)a0.s4 * b0.s1;
        c02 += (ushort)a0.s4 * b0.s2;
        c03 += (ushort)a0.s4 * b0.s3;

        c10 += (ushort)a0.s5 * b0.s0;
        c11 += (ushort)a0.s5 * b0.s1;
        c12 += (ushort)a0.s5 * b0.s2;
        c13 += (ushort)a0.s5 * b0.s3;

        c20 += (ushort)a0.s6 * b0.s0;
        c21 += (ushort)a0.s6 * b0.s1;
        c22 += (ushort)a0.s6 * b0.s2;
        c23 += (ushort)a0.s6 * b0.s3;

        c30 += (ushort)a0.s7 * b0.s0;
        c31 += (ushort)a0.s7 * b0.s1;
        c32 += (ushort)a0.s7 * b0.s2;
        c33 += (ushort)a0.s7 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s8 * b0.s0;
        c01 += (ushort)a0.s8 * b0.s1;
        c02 += (ushort)a0.s8 * b0.s2;
        c03 += (ushort)a0.s8 * b0.s3;

        c10 += (ushort)a0.s9 * b0.s0;
        c11 += (ushort)a0.s9 * b0.s1;
        c12 += (ushort)a0.s9 * b0.s2;
        c13 += (ushort)a0.s9 * b0.s3;

        c20 += (ushort)a0.sA * b0.s0;
        c21 += (ushort)a0.sA * b0.s1;
        c22 += (ushort)a0.sA * b0.s2;
        c23 += (ushort)a0.sA * b0.s3;

        c30 += (ushort)a0.sB * b0.s0;
        c31 += (ushort)a0.sB * b0.s1;
        c32 += (ushort)a0.sB * b0.s2;
        c33 += (ushort)a0.sB * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.sC * b0.s0;
        c01 += (ushort)a0.sC * b0.s1;
        c02 += (ushort)a0.sC * b0.s2;
        c03 += (ushort)a0.sC * b0.s3;

        c10 += (ushort)a0.sD * b0.s0;
        c11 += (ushort)a0.sD * b0.s1;
        c12 += (ushort)a0.sD * b0.s2;
        c13 += (ushort)a0.sD * b0.s3;

        c20 += (ushort)a0.sE * b0.s0;
        c21 += (ushort)a0.sE * b0.s1;
        c22 += (ushort)a0.sE * b0.s2;
        c23 += (ushort)a0.sE * b0.s3;

        c30 += (ushort)a0.sF * b0.s0;
        c31 += (ushort)a0.sF * b0.s1;
        c32 += (ushort)a0.sF * b0.s2;
        c33 += (ushort)a0.sF * b0.s3;
    }
#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar4 a0 = vload4(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x4 block
    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
}
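
// Precision note on the kernel above: each (ushort)a * b product of two 8-bit
// values is at most 255 * 255 = 65025, so it fits in 16 bits, and the 32-bit
// uint accumulators can absorb up to UINT_MAX / 65025 (about 66000) such
// products, comfortably more than any COLS_B this kernel is compiled for.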

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication.
 *
 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (e.g. -DTRANSPOSE1XW_WIDTH_STEP=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed_bifrost_dot8(IMAGE_DECLARATION(src0),
                                                              IMAGE_DECLARATION(src1),
                                                              IMAGE_DECLARATION(dst))
{
    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B
    __global uchar *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    uint c00 = 0;
    uint c01 = 0;
    uint c02 = 0;
    uint c03 = 0;
    uint c10 = 0;
    uint c11 = 0;
    uint c12 = 0;
    uint c13 = 0;
    uint c20 = 0;
    uint c21 = 0;
    uint c22 = 0;
    uint c23 = 0;
    uint c30 = 0;
    uint c31 = 0;
    uint c32 = 0;
    uint c33 = 0;

#if MULT_INTERLEAVE4X4_HEIGHT == 1
    for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar16 a0 = vload16(0, src_addr_a);
        uchar4  b0 = vload4(0, src_addr_b);
        uchar4  b1 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);
        uchar4  b2 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);
        uchar4  b3 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

        // Accumulate
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s0, b1.s0, b2.s0, b3.s0, c00);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s1, b1.s1, b2.s1, b3.s1, c01);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s2, b1.s2, b2.s2, b3.s2, c02);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s3, b1.s3, b2.s3, b3.s3, c03);

        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s0, b1.s0, b2.s0, b3.s0, c10);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s1, b1.s1, b2.s1, b3.s1, c11);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s2, b1.s2, b2.s2, b3.s2, c12);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s3, b1.s3, b2.s3, b3.s3, c13);

        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s0, b1.s0, b2.s0, b3.s0, c20);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s1, b1.s1, b2.s1, b3.s1, c21);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s2, b1.s2, b2.s2, b3.s2, c22);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s3, b1.s3, b2.s3, b3.s3, c23);

        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s0, b1.s0, b2.s0, b3.s0, c30);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s1, b1.s1, b2.s1, b3.s1, c31);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s2, b1.s2, b2.s2, b3.s2, c32);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s3, b1.s3, b2.s3, b3.s3, c33);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload16(0, src_addr_a + 16);
        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
        b1 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
        b2 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
        b3 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

        // Accumulate
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s0, b1.s0, b2.s0, b3.s0, c00);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s1, b1.s1, b2.s1, b3.s1, c01);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s2, b1.s2, b2.s2, b3.s2, c02);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s3, b1.s3, b2.s3, b3.s3, c03);

        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s0, b1.s0, b2.s0, b3.s0, c10);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s1, b1.s1, b2.s1, b3.s1, c11);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s2, b1.s2, b2.s2, b3.s2, c12);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s3, b1.s3, b2.s3, b3.s3, c13);

        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s0, b1.s0, b2.s0, b3.s0, c20);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s1, b1.s1, b2.s1, b3.s1, c21);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s2, b1.s2, b2.s2, b3.s2, c22);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s3, b1.s3, b2.s3, b3.s3, c23);

        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s0, b1.s0, b2.s0, b3.s0, c30);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s1, b1.s1, b2.s1, b3.s1, c31);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s2, b1.s2, b2.s2, b3.s2, c32);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s3, b1.s3, b2.s3, b3.s3, c33);
    }
#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar4 a0 = vload4(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x4 block
    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
}
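
// How the ARM_DOT calls above are laid out: a0 packs four consecutive k-steps of
// the four interleaved A rows (row 0 in .s0/.s4/.s8/.sC, row 1 in .s1/.s5/.s9/.sD,
// and so on), while b0..b3 hold the four matching transposed rows of B. Each
// ARM_DOT therefore folds four k iterations of one accumulator into a single dot
// product, e.g. c00 += a0.s0 * b0.s0 + a0.s4 * b1.s0 + a0.s8 * b2.s0 + a0.sC * b3.s0.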
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)

#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)

#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
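// For example, compiling with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 expands the
// three macros above to uchar4, uint4 and int4, so each work-item of the kernels
// below computes four consecutive output columns per processed row.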
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

    int end_row_vec_a = src_addr.s0 + COLS_A;

    VECTOR_UINT acc0 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VECTOR_UINT acc1 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VECTOR_UINT acc2 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VECTOR_UINT acc3 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    VECTOR_UINT acc4 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

    for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
    {
        // Load values from matrix A
        uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
        VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
        acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
        acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
        acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
        acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
        acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
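
// In matrix terms, work-item (gx, gy) above computes, for each of its rows
// i < NUM_ELEMS_PROCESSED_PER_THREAD_Y and columns j < NUM_ELEMS_PROCESSED_PER_THREAD_X,
//   acc_i[j] = sum_k A(gy * NUM_ELEMS_PROCESSED_PER_THREAD_Y + i, k) * B(k, gx * NUM_ELEMS_PROCESSED_PER_THREAD_X + j)
// with k running over COLS_A, i.e. the plain un-reshaped GEMM inner product widened to 32 bits.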

/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

    int end_row_vec_a = src_addr.s0 + COLS_A;

    uint acc00 = 0;
    uint acc01 = 0;
    uint acc02 = 0;
    uint acc03 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    uint acc10 = 0;
    uint acc11 = 0;
    uint acc12 = 0;
    uint acc13 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    uint acc20 = 0;
    uint acc21 = 0;
    uint acc22 = 0;
    uint acc23 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    uint acc30 = 0;
    uint acc31 = 0;
    uint acc32 = 0;
    uint acc33 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    uint acc40 = 0;
    uint acc41 = 0;
    uint acc42 = 0;
    uint acc43 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

    for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
    {
        // Load values from matrix A
        uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
        uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
        uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
        uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;

            acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;

            acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;

            acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;

            acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;

            acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0;

            acc00 += ((uint)tmp0);
            acc01 += ((uint)tmp1);
            acc02 += ((uint)tmp2);
            acc03 += ((uint)tmp3);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1;

            acc10 += ((uint)tmp0);
            acc11 += ((uint)tmp1);
            acc12 += ((uint)tmp2);
            acc13 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2;

            acc20 += ((uint)tmp0);
            acc21 += ((uint)tmp1);
            acc22 += ((uint)tmp2);
            acc23 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3;

            acc30 += ((uint)tmp0);
            acc31 += ((uint)tmp1);
            acc32 += ((uint)tmp2);
            acc33 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4;

            acc40 += ((uint)tmp0);
            acc41 += ((uint)tmp1);
            acc42 += ((uint)tmp2);
            acc43 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
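
// Design note (an inference, not stated in the original sources): the explicit
// ushort temporaries above keep every 8-bit x 8-bit product in 16 bits before a
// single widening add, which gives the Bifrost compiler the chance to issue
// narrow multiplies; arithmetically each accumulator still receives the same
// 4-term dot product computed by ARM_DOT in the dot8 variant below, e.g.
// acc00 += a0.s0 * b0.s0 + a0.s1 * b1.s0 + a0.s2 * b2.s0 + a0.s3 * b3.s0.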

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
/** OpenCL kernel optimized to use the dot product instruction that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
1121__kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),
1122 IMAGE_DECLARATION(src1),
1123 IMAGE_DECLARATION(dst))
1124{
1125 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
1126
1127 // Compute starting address for matrix A and Matrix B
1128 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1129
1130 // Update address for the matrix A
1131 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
1132
1133 // Update address for the matrix B
1134 src_addr.s1 += idx;
1135
1136 int end_row_vec_a = src_addr.s0 + COLS_A;
1137
1138 uint acc00 = 0;
1139 uint acc01 = 0;
1140 uint acc02 = 0;
1141 uint acc03 = 0;
1142#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1143 uint acc10 = 0;
1144 uint acc11 = 0;
1145 uint acc12 = 0;
1146 uint acc13 = 0;
1147#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1148#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1149 uint acc20 = 0;
1150 uint acc21 = 0;
1151 uint acc22 = 0;
1152 uint acc23 = 0;
1153#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1154#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1155 uint acc30 = 0;
1156 uint acc31 = 0;
1157 uint acc32 = 0;
1158 uint acc33 = 0;
1159#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1160#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1161 uint acc40 = 0;
1162 uint acc41 = 0;
1163 uint acc42 = 0;
1164 uint acc43 = 0;
1165#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1166
    for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
    {
        // Load values from matrix A
        uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
        uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
        uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
        uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

        {
            // Accumulate
            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a0.s0, a0.s1, a0.s2, a0.s3, acc00);
            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a0.s0, a0.s1, a0.s2, a0.s3, acc01);
            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a0.s0, a0.s1, a0.s2, a0.s3, acc02);
            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a0.s0, a0.s1, a0.s2, a0.s3, acc03);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a1.s0, a1.s1, a1.s2, a1.s3, acc10);
            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a1.s0, a1.s1, a1.s2, a1.s3, acc11);
            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a1.s0, a1.s1, a1.s2, a1.s3, acc12);
            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a1.s0, a1.s1, a1.s2, a1.s3, acc13);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a2.s0, a2.s1, a2.s2, a2.s3, acc20);
            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a2.s0, a2.s1, a2.s2, a2.s3, acc21);
            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a2.s0, a2.s1, a2.s2, a2.s3, acc22);
            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a2.s0, a2.s1, a2.s2, a2.s3, acc23);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a3.s0, a3.s1, a3.s2, a3.s3, acc30);
            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a3.s0, a3.s1, a3.s2, a3.s3, acc31);
            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a3.s0, a3.s1, a3.s2, a3.s3, acc32);
            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a3.s0, a3.s1, a3.s2, a3.s3, acc33);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a4.s0, a4.s1, a4.s2, a4.s3, acc40);
            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a4.s0, a4.s1, a4.s2, a4.s3, acc41);
            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a4.s0, a4.s1, a4.s2, a4.s3, acc42);
            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a4.s0, a4.s1, a4.s2, a4.s3, acc43);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

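    // Leftover loop: process the remaining COLS_A % 4 columns one at a time,
    // widening the 8-bit operands before accumulating into the 32-bit accumulators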
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0;

            acc00 += ((uint)tmp0);
            acc01 += ((uint)tmp1);
            acc02 += ((uint)tmp2);
            acc03 += ((uint)tmp3);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1;

            acc10 += ((uint)tmp0);
            acc11 += ((uint)tmp1);
            acc12 += ((uint)tmp2);
            acc13 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2;

            acc20 += ((uint)tmp0);
            acc21 += ((uint)tmp1);
            acc22 += ((uint)tmp2);
            acc23 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3;

            acc30 += ((uint)tmp0);
            acc31 += ((uint)tmp1);
            acc32 += ((uint)tmp2);
            acc33 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4;

            acc40 += ((uint)tmp0);
            acc41 += ((uint)tmp1);
            acc42 += ((uint)tmp2);
            acc43 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
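
/* Illustration only: a scalar sketch of the update that a single ARM_DOT call in
 * gemmlowp_mm_bifrost_dot8 performs on one accumulator. The b_col vector gathers one
 * column of matrix B across the four loaded rows, and a_row holds four consecutive
 * values from one row of matrix A. GEMMLOWP_REFERENCE_SKETCHES is a hypothetical
 * define used here so the sketch is not compiled by default.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES)
inline uint dot8_acc_sketch(uchar4 b_col, uchar4 a_row, uint acc)
{
    // acc += dot(b_col, a_row), with each 8-bit product widened to 32 bits
    acc += (uint)b_col.s0 * (uint)a_row.s0;
    acc += (uint)b_col.s1 * (uint)a_row.s1;
    acc += (uint)b_col.s2 * (uint)a_row.s2;
    acc += (uint)b_col.s3 * (uint)a_row.s3;
    return acc;
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES)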

#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)

#if defined(COLS_A)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
 *
 * @note This stage is needed to handle the offset of matrix product
 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: S32
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    uint4 sum_row_u32 = (uint4)0;
    uint sum_row = 0;

    __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int i = 0;

    // This for loop performs 16 accumulations per iteration
    for(; i <= ((int)COLS_A - 16); i += 16)
    {
        const uchar16 a0_u8 = vload16(0, matrix_a + i);

        sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
    }

    // This for loop performs the leftover accumulations
    for(; i < COLS_A; ++i)
    {
        sum_row += matrix_a[i];
    }

    sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;

    *((__global int *)dst.ptr) = (int)sum_row;
}
#endif // defined(COLS_A)
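
/* Illustration only: the scalar row-sum each work-item of gemmlowp_matrix_a_reduction
 * computes, without the 16-wide vectorization. GEMMLOWP_REFERENCE_SKETCHES is a
 * hypothetical define; the sketch is not compiled by default.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES)
inline int row_sum_sketch(__global const uchar *row, int cols)
{
    uint sum_row = 0;
    for(int x = 0; x < cols; ++x)
    {
        sum_row += row[x]; // each uchar is widened to uint before accumulation
    }
    return (int)sum_row;
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES)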

#if defined(COLS_B) && defined(ROWS_B)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
 *
 * @note This stage is needed to handle the offset of matrix product
 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: S32
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    uint16 sum_col_u32 = (uint16)0;

    __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

    int i = 0;
    // This for loop performs 4 accumulations per iteration
    for(; i <= ((int)ROWS_B - 4); i += 4)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
        const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
        const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
        const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);

        sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);

        matrix_b += 4 * src_stride_y;
    }

    // This for loop performs the leftover accumulations
    for(; i < (int)ROWS_B; ++i)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b);

        sum_col_u32 += convert_uint16(b0_u8);

        matrix_b += src_stride_y;
    }

    vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
}
#endif // defined(COLS_B) && defined(ROWS_B)
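
/* Illustration only: the scalar sum computed for a single column of matrix B, i.e.
 * one of the 16 lanes a work-item of gemmlowp_matrix_b_reduction covers; the kernel
 * walks rows with src_stride_y exactly as sketched here. GEMMLOWP_REFERENCE_SKETCHES
 * is a hypothetical define; the sketch is not compiled by default.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES)
inline int col_sum_sketch(__global const uchar *col_base, int rows, int stride_y)
{
    uint sum_col = 0;
    for(int y = 0; y < rows; ++y)
    {
        sum_col += col_base[y * stride_y]; // one uchar per row of matrix B
    }
    return (int)sum_col;
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES)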

#if defined(K_OFFSET)
/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
 * and adds to it the offset contribution of matrix A and matrix B in-place.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually, if gemmlowp is used to accelerate the convolution layer, sum_col will not have batches
 *
 * The final result is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
 *
 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_col_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_step_x sum_col_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_col_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_step_y sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_col_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_row_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_step_x sum_row_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_row_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_step_y sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_row_offset_first_element_in_bytes The offset of the first element in the source tensor
 */
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                          )
{
    Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);

    int4 a_offset_s32 = (int4)0;
    int4 b_offset_s32 = (int4)0;

#if defined(A_OFFSET)
    Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);

    // Compute the offset contribution due to A_OFFSET
#if defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
#else  // defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr));
#endif // defined(SUM_COL_HAS_BATCHES)

    a_offset_s32 *= (int4)A_OFFSET;
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);

    // Compute the offset contribution due to B_OFFSET
    b_offset_s32 = (int4) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
    b_offset_s32 *= (int4)B_OFFSET;
#endif // defined(B_OFFSET)

    const int4 offset_term_s32 = (int4)K_OFFSET + a_offset_s32 + b_offset_s32;

    int4 in_s32 = vload4(0, (__global int *)mm_result.ptr);

    // Add the offset terms to GEMM's result
    in_s32 += offset_term_s32;

    // Store the result with the offset contribution
    vstore4(in_s32, 0, (__global int *)mm_result.ptr);
}
#endif // defined(K_OFFSET)
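
/* Illustration only: scalar form of the documented in-place update
 * mm_result[i][k] += sum_col[k] * A_OFFSET + sum_row[i] * B_OFFSET + K_OFFSET.
 * Guarded by the hypothetical GEMMLOWP_REFERENCE_SKETCHES define and compiled only
 * when all three offset defines are present.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES) && defined(K_OFFSET) && defined(A_OFFSET) && defined(B_OFFSET)
inline int offset_contribution_sketch(int mm_result, int sum_col_k, int sum_row_i)
{
    mm_result += sum_col_k * A_OFFSET; // contribution due to a_offset
    mm_result += sum_row_i * B_OFFSET; // contribution due to b_offset
    mm_result += K_OFFSET;             // constant term a_offset * b_offset * k
    return mm_result;
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES) && defined(K_OFFSET) && defined(A_OFFSET) && defined(B_OFFSET)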

#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 * -# Add offset terms to final result
 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
 * -# Multiply each entry of result by result_mult_int
 * -# Shift the int32 accumulator by result_shift
 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 * These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                  VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                  TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

    // Add the offset terms to GEMM's result
    input_values += (int16)RESULT_OFFSET;

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_mult_int and shift
    input_values *= RESULT_MULT_INT;

    input_values >>= RESULT_SHIFT;

    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
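
/* Illustration only: scalar form of the quantize-down stage above (offset, optional
 * bias, integer scale, arithmetic shift, saturating cast to 8 bits); pass bias = 0
 * when -DADD_BIAS is not used. Guarded by the hypothetical GEMMLOWP_REFERENCE_SKETCHES
 * define and compiled only when the stage's defines are present.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES) && defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
inline uchar quantize_down_sketch(int acc, int bias)
{
    int val = acc + RESULT_OFFSET + bias; // add offset (and bias, when used)
    val *= RESULT_MULT_INT;               // scale by the integer multiplier
    val >>= RESULT_SHIFT;                 // arithmetic shift right
    return convert_uchar_sat(val);        // saturate to the [0..255] QASYMM8 range
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES) && defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)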

#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 * -# Add bias to final result if bias tensor is not a nullptr
 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
 * -# Round to nearest division by a power-of-two using result_shift
 * -# Add offset to each result
 * -# Clamp the value between the specified min and max bounds
 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 * These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                             VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                             TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_fixedpoint_multiplier and shift
    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);

    // Add the offset terms to GEMM's result
    input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;

    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
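
/* Illustration only: a scalar sketch of the fixed-point requantization used above,
 * following the gemmlowp definition of a saturating doubling high multiply followed
 * by a rounding arithmetic shift right; the INT_MIN * INT_MIN saturation corner case
 * is omitted for brevity. Guarded by the hypothetical GEMMLOWP_REFERENCE_SKETCHES define.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES)
inline int requantize_fixedpoint_sketch(int x, int multiplier, int shift)
{
    // Doubling high multiply, rounding the discarded low 31 bits to nearest
    long ab    = (long)x * (long)multiplier;
    long nudge = (ab >= 0) ? (1L << 30) : (1L - (1L << 30));
    int high   = (int)((ab + nudge) / (1L << 31));

    // Rounding arithmetic shift right by 'shift'
    int mask      = (1 << shift) - 1;
    int remainder = high & mask;
    int threshold = (mask >> 1) + ((high < 0) ? 1 : 0);
    return (high >> shift) + ((remainder > threshold) ? 1 : 0);
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES)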