/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
 *
 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
 * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators with the constant term of the zero-point
    // expansion: K * lhs_offset * rhs_offset. The terms depending on the
    // row/column sums are subtracted after the K loop.
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)(LHS_OFFSET)) * ((int)(RHS_OFFSET));
    })

    // Column sums of the rhs tile (b) and row sums of the lhs tile (a),
    // accumulated across the whole K dimension for the offset correction.
    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion
        // in order to use T_MMUL_NT_T macro because only this macro
        // can utilize dot product instruction for Int8/UInt8 by
        // directly multiplying the rows of Lhs and Rhs tensors.
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion.
        // See the main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

    // Subtract the cross terms of the zero-point expansion:
    // rhs_offset * sum_k(lhs) and lhs_offset * sum_k(rhs)
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
 *
 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
 * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0, K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    // RHS is transposed: the x (N) coordinate advances along rows, hence rhs_stride_y
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators with the constant term of the zero-point
    // expansion: K * lhs_offset * rhs_offset
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)(LHS_OFFSET)) * ((int)(RHS_OFFSET));
    })

    // Row sums of lhs and rhs tiles over K, needed for the offset correction
    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    // Leftover loop
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

    // Subtract the cross terms of the zero-point expansion
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed
 *
 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
 * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    // LHS is transposed: the y (M) coordinate advances along columns, hence sizeof(DATA_TYPE)
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators with the constant term of the zero-point
    // expansion: K * lhs_offset * rhs_offset
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)(LHS_OFFSET)) * ((int)(RHS_OFFSET));
    })

    // Sums of lhs rows and rhs columns over K, needed for the offset correction
    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs/rhs tensors in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs/rhs tensors in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

    // Subtract the cross terms of the zero-point expansion
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed
 *
 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
 * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 8, 16
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    // Both operands are transposed: y advances along lhs columns, x along rhs rows
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators with the constant term of the zero-point
    // expansion: K * lhs_offset * rhs_offset
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)(LHS_OFFSET)) * ((int)(RHS_OFFSET));
    })

    // Sums of lhs rows and rhs columns over K, needed for the offset correction
    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

    // Subtract the cross terms of the zero-point expansion
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)