/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "helpers_asymm.h"

#if defined(COLS_B)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication.
 *
 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
                                                 IMAGE_DECLARATION(dst))
{
    // src_addr.s0 = address of matrix A
    // src_addr.s1 = address of matrix B
    // Compute address for matrix A and B
    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
                                                                        (src1_stride_y));

    // Add offset_first_element_in_bytes
    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Compute end row address for matrix B
    int end_row_mtx_b = src_addr.s1 + COLS_B;

    // Reset accumulators
    int16 c00 = 0;
    int16 c10 = 0;
    int16 c20 = 0;
    int16 c30 = 0;

    for(; src_addr.s1 <= (end_row_mtx_b - 32); src_addr += (int2)(8, 32))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int8  a0 = convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
        int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

        c00 += (int16)a0.s0 * b0;
        c10 += (int16)a0.s1 * b0;
        c20 += (int16)a0.s2 * b0;
        c30 += (int16)a0.s3 * b0;

        int16 b1 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));

        c00 += (int16)a0.s4 * b1;
        c10 += (int16)a0.s5 * b1;
        c20 += (int16)a0.s6 * b1;
        c30 += (int16)a0.s7 * b1;
    }

    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int4  a0 = convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
        int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

        c00 += (int16)a0.s0 * b0;
        c10 += (int16)a0.s1 * b0;
        c20 += (int16)a0.s2 * b0;
        c30 += (int16)a0.s3 * b0;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x16 block
    vstore16(c00, 0, (__global int *)(offset(&dst, 0, 0)));
    vstore16(c10, 0, (__global int *)(offset(&dst, 0, 1)));
    vstore16(c20, 0, (__global int *)(offset(&dst, 0, 2)));
    vstore16(c30, 0, (__global int *)(offset(&dst, 0, 3)));
}
#endif // defined(COLS_B)
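
/* Usage sketch (hypothetical host-side values): the kernel above expects A reshaped by
 * gemm_interleave4x4_8bit and B reshaped by gemm_transpose1x16. In effect, the interleaved A
 * buffer stores each group of four rows column by column (a[r][k], a[r+1][k], a[r+2][k],
 * a[r+3][k], a[r][k+1], ...) and the transposed B buffer stores 16 consecutive columns of a row
 * contiguously, which is why each main-loop iteration consumes 8 bytes of A and 32 bytes of B to
 * update a 4x16 int32 block. A possible build call, assuming 256 columns in B:
 *
 *   clBuildProgram(program, 1, &device, "-DCOLS_B=256", NULL, NULL);
 */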

#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) when neither matrix has been reshaped.
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

    int end_row_vec_a = src_addr.s0 + COLS_A;

    VECTOR_UINT acc0 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VECTOR_UINT acc1 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VECTOR_UINT acc2 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VECTOR_UINT acc3 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    VECTOR_UINT acc4 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

    for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
    {
        // Load values from matrix A
        uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
        VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
        acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
        acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
        acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
        acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
        acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
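
/* Worked sketch (hypothetical values): with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 and
 * -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4, each work item of the kernel above accumulates a
 * 4x4 int32 tile: the main loop consumes two columns of A and two rows of B per iteration,
 * and the second loop handles the single leftover column when COLS_A is odd. A possible
 * build-options string for this configuration:
 *
 *   "-DCOLS_A=64 -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4"
 */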

/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) when neither matrix has been reshaped.
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

    int end_row_vec_a = src_addr.s0 + COLS_A;

    uint acc00 = 0;
    uint acc01 = 0;
    uint acc02 = 0;
    uint acc03 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    uint acc10 = 0;
    uint acc11 = 0;
    uint acc12 = 0;
    uint acc13 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    uint acc20 = 0;
    uint acc21 = 0;
    uint acc22 = 0;
    uint acc23 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    uint acc30 = 0;
    uint acc31 = 0;
    uint acc32 = 0;
    uint acc33 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    uint acc40 = 0;
    uint acc41 = 0;
    uint acc42 = 0;
    uint acc43 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

    for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
    {
        // Load values from matrix A
        uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
        uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
        uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
        uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;

            acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;

            acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;

            acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;

            acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;

            acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0;

            acc00 += ((uint)tmp0);
            acc01 += ((uint)tmp1);
            acc02 += ((uint)tmp2);
            acc03 += ((uint)tmp3);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1;

            acc10 += ((uint)tmp0);
            acc11 += ((uint)tmp1);
            acc12 += ((uint)tmp2);
            acc13 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2;

            acc20 += ((uint)tmp0);
            acc21 += ((uint)tmp1);
            acc22 += ((uint)tmp2);
            acc23 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3;

            acc30 += ((uint)tmp0);
            acc31 += ((uint)tmp1);
            acc32 += ((uint)tmp2);
            acc33 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4;

            acc40 += ((uint)tmp0);
            acc41 += ((uint)tmp1);
            acc42 += ((uint)tmp2);
            acc43 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
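
/* Why the 16-bit intermediate products in the kernel above are safe: the product of two 8-bit
 * values is at most 255 * 255 = 65025, which fits in a ushort (maximum 65535), so every tmp*
 * multiplication is exact and the widening to 32 bits can be deferred to the accumulation.
 * For example, a = 200 and b = 250 give tmp = 50000, still a valid ushort, and the four
 * products added into each accumulator sum to at most 4 * 65025 = 260100, hence the uint adds.
 */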

#if defined(COLS_A)
/** OpenCL kernel used to compute the sum of all the entries in each row of matrix A.
 *
 * @note This stage is needed to handle the offset of the matrix product; see
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    uint4 sum_row_u32 = (uint4)0;
    uint  sum_row     = 0;

    __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int i = 0;

    // This for loop performs 16 accumulations per iteration
    for(; i <= ((int)COLS_A - 16); i += 16)
    {
        const uchar16 a0_u8 = vload16(0, matrix_a + i);

        sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
    }

    // This for loop performs the leftover accumulations
    for(; i < COLS_A; ++i)
    {
        sum_row += matrix_a[i];
    }

    sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;

    *((__global int *)dst.ptr) = (int)sum_row;
}
#endif // defined(COLS_A)
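
/* Worked sketch (hypothetical value): with -DCOLS_A=20, the vectorized loop above runs once,
 * accumulating columns 0-15 into the four uint4 lanes of sum_row_u32, and the scalar loop adds
 * the remaining 4 columns; the four lanes are then folded into the final per-row sum consumed
 * by the offset-contribution stage.
 */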

#if defined(COLS_B) && defined(ROWS_B)
/** OpenCL kernel used to compute the sum of all the entries in each column of matrix B.
 *
 * @note This stage is needed to handle the offset of the matrix product; see
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    uint16 sum_col_u32 = (uint16)0;

    __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

    int i = 0;
    // This for loop performs 4 accumulations per iteration
    for(; i <= ((int)ROWS_B - 4); i += 4)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
        const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
        const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
        const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);

        sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);

        matrix_b += 4 * src_stride_y;
    }

    // This for loop performs the leftover accumulations
    for(; i < (int)ROWS_B; ++i)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b);

        sum_col_u32 += convert_uint16(b0_u8);

        matrix_b += src_stride_y;
    }

    vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
}
#endif // defined(COLS_B) && defined(ROWS_B)
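
/* Worked sketch (hypothetical value): each work item reduces a strip of 16 consecutive columns
 * of B. With -DROWS_B=10, the main loop above runs twice (rows 0-7, four rows per iteration)
 * and the leftover loop adds rows 8 and 9 before the 16 column sums are stored as int32.
 */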

#if defined(K_OFFSET)
/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place.
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
 * and adds to it the offset contribution of matrix A and matrix B in-place.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (e.g. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (e.g. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (e.g. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually, if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
 *
 * The final result is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
 *
 * @param[in] mm_result_ptr                           Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x                        mm_result_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] mm_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y                        mm_result_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] mm_result_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z                        mm_result_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_col_ptr                             Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_step_x                          sum_col_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_col_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_step_y                          sum_col_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_col_offset_first_element_in_bytes   The offset of the first element in the source tensor
 * @param[in] sum_row_ptr                             Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_step_x                          sum_row_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_row_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_step_y                          sum_row_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_row_offset_first_element_in_bytes   The offset of the first element in the source tensor
 */
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                          )
{
    Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);

    int16 a_offset_s32 = (int16)0;
    int16 b_offset_s32 = (int16)0;

#if defined(A_OFFSET)
    Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);

    // Compute the offset contribution due to A_OFFSET
#if defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload16(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
#else  // defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload16(0, (__global int *)(sum_col.ptr));
#endif // defined(SUM_COL_HAS_BATCHES)

    a_offset_s32 *= (int16)A_OFFSET;
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);

    // Compute the offset contribution due to B_OFFSET
    b_offset_s32 = (int16) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
    b_offset_s32 *= (int16)B_OFFSET;
#endif // defined(B_OFFSET)

    const int16 offset_term_s32 = (int16)K_OFFSET + a_offset_s32 + b_offset_s32;

    int16 in_s32 = vload16(0, (__global int *)mm_result.ptr);

    // Add the offset terms to GEMM's result
    in_s32 += offset_term_s32;

    // Store the result with the offset contribution
    vstore16(in_s32, 0, (__global int *)mm_result.ptr);
}
#endif // defined(K_OFFSET)
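
/* Worked example of the formula above (hypothetical values): take k = 2, A_OFFSET = 10,
 * B_OFFSET = 20, one row of A = [1, 2] and one column of B = [3, 4]. Then
 * mm_result = 1*3 + 2*4 = 11, sum_col = 3 + 4 = 7, sum_row = 1 + 2 = 3 and
 * K_OFFSET = 10 * 20 * 2 = 400, so the corrected value is
 * 11 + (7 * 10) + (3 * 20) + 400 = 541, which matches the direct computation
 * (1 + 10) * (3 + 20) + (2 + 10) * (4 + 20) = 253 + 288 = 541.
 */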

#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 * -# Add offset terms to the final result
 * -# Add bias to the final result (if -DADD_BIAS is passed at compile time)
 * -# Multiply each entry of the result by result_mult_int
 * -# Shift the int32 accumulator by result_shift
 * -# Clamp the resulting int32 values to the [0..255] range and cast them to QASYMM8
 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of the output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                  VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                  TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

    // Add the offset terms to GEMM's result
    input_values += (int16)RESULT_OFFSET;

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_mult_int and shift
    input_values *= RESULT_MULT_INT;

    input_values >>= RESULT_SHIFT;

    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
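
/* Worked example (hypothetical values): with -DRESULT_OFFSET=-1000, -DRESULT_MULT_INT=2 and
 * -DRESULT_SHIFT=8, an accumulator value of 25000 (and no bias) becomes
 * ((25000 - 1000) * 2) >> 8 = 48000 >> 8 = 187, which lies in [0, 255] and is stored as the
 * QASYMM8 value 187; values outside that range are saturated by convert_uchar16_sat and, if
 * -DMIN_BOUND/-DMAX_BOUND are defined, clamped further.
 */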

#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 * -# Add bias to the final result if the bias tensor is not a nullptr
 * -# Compute the fixed-point multiplication of each entry of the input by result_fixedpoint_multiplier
 * -# Round to the nearest division by a power-of-two using result_shift
 * -# Add offset to each result
 * -# Clamp the resulting int32 values to the [0..255] range and cast them to QASYMM8
 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 *
 * @attention The offset after shift, fixed-point multiplier and number of bits to shift right of the output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                             VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                             TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_fixedpoint_multiplier and shift
    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);

    // Add the offset terms to GEMM's result
    input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;

    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
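
/* Sketch of the fixed-point rescale above, assuming the gemmlowp convention: the requantization
 * scale is expressed as M = M0 * 2^(-RESULT_SHIFT) with M0 in [0.5, 1) stored as a Q31 integer,
 * and ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, M0, shift, 16) roughly computes
 * round(x * M0 / 2^31) followed by a rounding right shift. For example, x = 1000 with
 * M0 = 2^30 (i.e. 0.5 in Q31) and shift = 1 gives round(1000 * 0.5) = 500, then a rounding
 * shift by 1 yields 250, i.e. x scaled by 0.25.
 */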