/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "tile_helpers.h"

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })
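
    // The initialization above and the a_sum/b_sum corrections applied after the K loop
    // come from expanding the quantized product (a sketch of the math, assuming
    // LHS_OFFSET/RHS_OFFSET are the offsets to be added to the raw quantized values,
    // with any sign handling done on the host side):
    //
    //   sum_k (a_k + LHS_OFFSET) * (b_k + RHS_OFFSET)
    //     =   sum_k a_k * b_k               (computed by T_MMUL in the loops below)
    //       + RHS_OFFSET * sum_k a_k        (a_sum correction, applied after the loops)
    //       + LHS_OFFSET * sum_k b_k        (b_sum correction, applied after the loops)
    //       + K * LHS_OFFSET * RHS_OFFSET   (constant term, folded in here)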

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion,
        // in order to use the T_MMUL_NT_T macro: only this macro
        // can utilize the dot product instruction for Int8/UInt8
        // by directly multiplying the rows of the lhs and rhs tiles.
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    // Leftover loop
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion.
        // See the main loop for more explanation.
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

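    // With GET_SPATIAL_IDX, the leftover (partial) block is assigned to the work-item
    // with global id 0 in each dimension, so only that work-item needs to take the
    // PARTIAL_STORE_* paths below.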
    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
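    // (T_QUANTIZE8_ASYMMETRIC is expected to downscale each int32 accumulator with the
    // fixed-point DST_MULTIPLIER/DST_SHIFT pair, add DST_OFFSET and saturate to the
    // 8-bit DATA_TYPE; see tile_helpers.h for the exact rounding behaviour.)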
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

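    // Build a row-index buffer for the store: when this work-item covers the partial
    // M block (y_cond), indices are clamped to PARTIAL_STORE_M0 - 1, so rows beyond
    // the valid region collapse onto the last valid row instead of writing out of bounds.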
    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
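
// A hypothetical set of build options for the kernel above, for illustration only;
// the actual option string is assembled by the host-side kernel configuration and
// the offset/multiplier/shift values come from the tensors' quantization info:
//   -DMAT_MUL_NATIVE_QUANTIZED_NT_NT -DDATA_TYPE=uchar -DM0=4 -DN0=4 -DK0=4 -DK=32
//   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0 -DLHS_OFFSET=5 -DRHS_OFFSET=-3
//   -DDST_OFFSET=10 -DDST_SHIFT=-3 -DDST_MULTIPLIER=1431655765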

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
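        // (no transposed load is needed here: the rhs matrix is stored transposed,
        // which already matches the row-by-row layout that T_MMUL(..., NT, T, ...) expects)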
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    // Leftover loop
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
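    // (lhs is transposed, so the starting row index y moves along an lhs row:
    // hence y * sizeof(DATA_TYPE) below instead of y * lhs_stride_y)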
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tiles from the lhs/rhs tensors in a transposed fashion.
        // See the mat_mul_native_quantized_nt_nt main loop for more explanation.
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    // Leftover loop
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tiles from the lhs/rhs tensors in a transposed fashion.
        // See the mat_mul_native_quantized_nt_nt main loop for more explanation.
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 8, 16
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
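    // (both matrices are transposed: y moves along an lhs row and x selects an rhs row,
    // hence y * sizeof(DATA_TYPE) for lhs and x * rhs_stride_y for rhs below)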
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion.
        // See the mat_mul_native_quantized_nt_nt main loop for more explanation.
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    // Leftover loop
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion.
        // See the mat_mul_native_quantized_nt_nt main loop for more explanation.
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)