/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "tile_helpers.h"
#include "activation_float_helpers.h"

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL set the min and max output values for the relu and bounded relu operations.
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0 are not expected to be efficient
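 * @note Example of a plausible set of build options (illustrative values only, not defaults; the
 *       quantization macros LHS_OFFSET, RHS_OFFSET, DST_OFFSET, DST_SHIFT and DST_MULTIPLIER
 *       used in the kernel body must be defined as well):
 *       -DMAT_MUL_NATIVE_QUANTIZED_NT_NT -DDATA_TYPE=char -DM0=4 -DN0=4 -DK0=4 -DK=32
 *       -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0 -DLHS_OFFSET=5 -DRHS_OFFSET=-3
 *       -DDST_OFFSET=10 -DDST_SHIFT=-8 -DDST_MULTIPLIER=1082130432
 *       -DACTIVATION_TYPE=identity -DA_VAL=0 -DB_VAL=0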
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion
        // in order to use the T_MMUL_NT_T macro, because only this macro
        // can utilize the dot product instruction for Int8/UInt8 by
        // directly multiplying the rows of the Lhs and Rhs tensors.
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion.
        // See the main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

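    // Apply the offset-correction terms. A sketch of the math, assuming LHS_OFFSET and
    // RHS_OFFSET are the negated zero points of the quantized LHS/RHS inputs:
    //   sum_k (a_k + LHS_OFFSET) * (b_k + RHS_OFFSET)
    //     =   sum_k a_k * b_k                 (accumulated by T_MMUL above)
    //       + RHS_OFFSET * sum_k a_k          (a_sum, added below)
    //       + LHS_OFFSET * sum_k b_k          (b_sum, added below)
    //       + K * LHS_OFFSET * RHS_OFFSET     (the accumulators' initial value)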
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

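    // x_cond/y_cond flag the work-items that own the leftover (partial) block along each
    // dimension; with the shifted indexing of GET_SPATIAL_IDX, that is global id 0.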
    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    // Quantize the tile
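    // (i.e. requantize the int32 accumulators into the destination's 8-bit asymmetric
    // format: DST_MULTIPLIER and DST_SHIFT encode the rescaling factor as a fixed-point
    // multiplication, and DST_OFFSET is the destination zero point)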
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

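    // Build the row indices for the indirect store: rows beyond the valid region are
    // clamped to the last valid row, so leftover work-items never write out of bounds.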
    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL set the min and max output values for the relu and bounded relu operations.
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    // Leftover loop
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

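    // Apply the offset-correction terms; see mat_mul_native_quantized_nt_nt for a sketch of the math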
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL set the min and max output values for the relu and bounded relu operations.
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs/rhs tensors in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs/rhs tensors in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

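    // Apply the offset-correction terms; see mat_mul_native_quantized_nt_nt for a sketch of the math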
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL set the min and max output values for the relu and bounded relu operations.
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 8, 16
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                             The width of the lhs tensor
 * @param[in]  lhs_h                             The height of the lhs tensor
 * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                             The width of the rhs tensor
 * @param[in]  rhs_h                             The height of the rhs tensor
 * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                             The width of the dst tensor
 * @param[in]  dst_h                             The height of the dst tensor
 * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

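    // Apply the offset-correction terms; see mat_mul_native_quantized_nt_nt for a sketch of the math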
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] += ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    T_ACTIVATION(int, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)