/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "helpers_asymm.h"

#if defined(COLS_B)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
 *
 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
                                                 IMAGE_DECLARATION(dst))
{
    // src_addr.s0 = address of matrix A
    // src_addr.s1 = address of matrix B
    // Compute address for matrix A and B
    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
                                                                        (src1_stride_y));

    // Add offset_first_element_in_bytes
    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Compute end row address for matrix B
    int end_row_mtx_b = src_addr.s1 + COLS_B;

    // Reset accumulators
    int16 c00 = 0;
    int16 c10 = 0;
    int16 c20 = 0;
    int16 c30 = 0;

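    // Main loop: each iteration consumes two k-steps of a 4-row interleaved block of A (8 bytes)
    // and two 16-wide transposed rows of B (32 bytes), accumulating into a 4x16 output tile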
    for(; src_addr.s1 <= (end_row_mtx_b - 32); src_addr += (int2)(8, 32))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int8 a0  = convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
        int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

        c00 += (int16)a0.s0 * b0;
        c10 += (int16)a0.s1 * b0;
        c20 += (int16)a0.s2 * b0;
        c30 += (int16)a0.s3 * b0;

        int16 b1 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));

        c00 += (int16)a0.s4 * b1;
        c10 += (int16)a0.s5 * b1;
        c20 += (int16)a0.s6 * b1;
        c30 += (int16)a0.s7 * b1;
    }

    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int4 a0  = convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
        int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

        c00 += (int16)a0.s0 * b0;
        c10 += (int16)a0.s1 * b0;
        c20 += (int16)a0.s2 * b0;
        c30 += (int16)a0.s3 * b0;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x16 block
    vstore16(c00, 0, (__global int *)(offset(&dst, 0, 0)));
    vstore16(c10, 0, (__global int *)(offset(&dst, 0, 1)));
    vstore16(c20, 0, (__global int *)(offset(&dst, 0, 2)));
    vstore16(c30, 0, (__global int *)(offset(&dst, 0, 3)));
}
#endif // defined(COLS_B)

#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 * @attention The number of elements processed per thread along x and y must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm(IMAGE_DECLARATION(src0),
                          IMAGE_DECLARATION(src1),
                          IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

    int end_row_vec_a = src_addr.s0 + COLS_A;

    VECTOR_UINT acc0 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VECTOR_UINT acc1 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VECTOR_UINT acc2 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VECTOR_UINT acc3 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

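    // Main loop: process two k-steps per iteration (two columns of A, two rows of B)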
    for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
    {
        // Load values from matrix A
        uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
        VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
        acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
        acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
        acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
        acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

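    // Leftover loop: process any remaining columns of A one k-step at a time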
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
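
// Example host-side build options for this kernel (hypothetical values; COLS_A must match
// the actual number of matrix A columns):
//   -DCOLS_A=64 -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4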
#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)

#if defined(COLS_A)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
 *
 * @note This stage is needed to handle the offsets of the matrix product
 *       (see https://github.com/google/gemmlowp/blob/master/doc/low-precision.md)
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);

    uint4 sum_row_u32 = (uint4)0;
    uint sum_row      = 0;

    __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int i = 0;

    // This for loop accumulates 16 elements per iteration
    for(; i <= ((int)COLS_A - 16); i += 16)
    {
        const uchar16 a0_u8 = vload16(0, matrix_a + i);

        sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
    }

    // This for loop performs the leftover accumulations
    for(; i < COLS_A; ++i)
    {
        sum_row += matrix_a[i];
    }

    sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;

    *((__global int *)dst.ptr) = (int)sum_row;
}
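
// Semantics sketch: one work-item per row (and per batch) computes
//   sum_row[i] = sum_{j < COLS_A} (uint)A[i][j]
// and stores it as a signed 32-bit value.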
#endif // defined(COLS_A)

#if defined(COLS_B) && defined(ROWS_B)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
 *
 * @note This stage is needed to handle the offsets of the matrix product
 *       (see https://github.com/google/gemmlowp/blob/master/doc/low-precision.md)
 *
 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image dst    = CONVERT_TO_IMAGE_STRUCT(dst);

    uint16 sum_col_u32 = (uint16)0;

    __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

    int i = 0;
    // This for loop accumulates 4 rows per iteration
    for(; i <= ((int)ROWS_B - 4); i += 4)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
        const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
        const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
        const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);

        sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);

        matrix_b += 4 * src_stride_y;
    }

    // This for loop performs the leftover accumulations
    for(; i < (int)ROWS_B; ++i)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b);

        sum_col_u32 += convert_uint16(b0_u8);

        matrix_b += src_stride_y;
    }

    vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
}
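
// Semantics sketch: each work-item reduces one 16-column strip of B (per batch):
//   sum_col[j] = sum_{i < ROWS_B} (uint)B[i][j]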
#endif // defined(COLS_B) && defined(ROWS_B)

#if defined(K_OFFSET)
/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
 * and adds to it the offset contribution of matrix A and matrix B in-place.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (e.g. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (e.g. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (e.g. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually, if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
 *
 * The final result is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
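 *
 * where sum_col holds the per-column sums of matrix B, sum_row the per-row sums of matrix A, and
 * K_OFFSET = A_OFFSET * B_OFFSET * k. These terms come from expanding the quantized product
 * sum_j (a[i][j] + A_OFFSET) * (b[j][k] + B_OFFSET), as described in the gemmlowp low-precision
 * documentation linked above.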
 *
 * @param[in] mm_result_ptr                           Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x                        mm_result_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] mm_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y                        mm_result_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] mm_result_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z                        mm_result_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_col_ptr                             Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_step_x                          sum_col_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_col_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_step_y                          sum_col_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_col_offset_first_element_in_bytes   The offset of the first element in the source tensor
 * @param[in] sum_row_ptr                             Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_step_x                          sum_row_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_row_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_step_y                          sum_row_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_row_offset_first_element_in_bytes   The offset of the first element in the source tensor
 */
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                          )
{
    Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);

    int16 a_offset_s32 = (int16)0;
    int16 b_offset_s32 = (int16)0;

#if defined(A_OFFSET)
    Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);

    // Compute the offset contribution due to A_OFFSET
#if defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload16(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
#else  // defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload16(0, (__global int *)(sum_col.ptr));
#endif // defined(SUM_COL_HAS_BATCHES)

    a_offset_s32 *= (int16)A_OFFSET;
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);

    // Compute the offset contribution due to B_OFFSET
    b_offset_s32 = (int16) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
    b_offset_s32 *= (int16)B_OFFSET;
#endif // defined(B_OFFSET)

    const int16 offset_term_s32 = (int16)K_OFFSET + a_offset_s32 + b_offset_s32;

    int16 in_s32 = vload16(0, (__global int *)mm_result.ptr);

    // Add the offset terms to GEMM's result
    in_s32 += offset_term_s32;

    // Store the result with the offset contribution
    vstore16(in_s32, 0, (__global int *)mm_result.ptr);
}
#endif // defined(K_OFFSET)

#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 *  -# Add offset terms to final result
 *  -# Add bias to final result (if -DADD_BIAS is passed at compile time)
 *  -# Multiply each entry of result by result_mult_int
 *  -# Shift the int32 accumulator by result_shift
 *  -# Saturate the resulting int32 values to the [0..255] range and cast to QASYMM8
 *  -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                  VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                  TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

    // Add the offset terms to GEMM's result
    input_values += (int16)RESULT_OFFSET;

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_mult_int and shift
    input_values *= RESULT_MULT_INT;

    input_values >>= RESULT_SHIFT;

    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
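
// A minimal scalar sketch of the per-element computation above (illustrative only;
// the function name is hypothetical and not part of the library):
inline uchar quantize_down_scalar_example(int x, int bias)
{
    x += RESULT_OFFSET;          // add the result offset
    x += bias;                   // bias term (pass 0 when no bias is used)
    x *= RESULT_MULT_INT;        // scale by the integer multiplier
    x >>= RESULT_SHIFT;          // arithmetic shift right
    return convert_uchar_sat(x); // saturate to the [0..255] QASYMM8 range
}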
#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)

#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 *  -# Add bias to final result (if -DADD_BIAS is passed at compile time)
 *  -# Compute fixed point multiplication between each entry of the input and result_fixedpoint_multiplier
 *  -# Round to nearest division by a power-of-two using result_shift
 *  -# Add offset to each result
 *  -# Saturate the resulting int32 values to the [0..255] range and cast to QASYMM8
 *  -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 *
 * @attention The offset, fixed point multiplier and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                             VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                             TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_fixedpoint_multiplier and shift
    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);

    // Add the offset terms to GEMM's result
    input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;

    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
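
// A scalar sketch of the fixed-point rescale performed above by
// ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE (illustrative only; the function name is
// hypothetical, the semantics follow gemmlowp's SaturatingRoundingDoublingHighMul
// followed by RoundingDivideByPOT):
inline int fixedpoint_rescale_example(int x, int multiplier, int shift)
{
    // Saturating rounding doubling high multiply: take the high 32 bits of 2*x*multiplier, rounded
    const long ab    = (long)x * (long)multiplier;
    const long nudge = ab >= 0 ? (1L << 30) : (1L - (1L << 30));
    const int  mul   = (x == INT_MIN && multiplier == INT_MIN) ? INT_MAX : (int)((ab + nudge) >> 31);

    // Rounding division by a power of two (round to nearest)
    const int mask      = (1 << shift) - 1;
    const int remainder = mul & mask;
    const int threshold = (mask >> 1) + (mul < 0 ? 1 : 0);
    return (mul >> shift) + (remainder > threshold ? 1 : 0);
}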
#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)