/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"

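// The kernels in this file are compiled with the -D build options documented in the
// @note sections of each kernel below. As a purely illustrative sketch (hypothetical
// values; the real option string is assembled by the host-side kernel configuration,
// not by this file), a build could look like:
//
//   -DMAT_MUL_NATIVE_QUANTIZED_NT_NT -DDATA_TYPE=uchar -DM0=4 -DN0=4 -DK0=4 -DK=64
//   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0
//   -DLHS_OFFSET=<lhs offset term> -DRHS_OFFSET=<rhs offset term>
//   -DDST_OFFSET=<dst offset> -DDST_MULTIPLIER=<fixed-point multiplier> -DDST_SHIFT=<shift>
//   -DZERO_POINT=<dst offset> -DACTIVATION_TYPE=<activation> -DA_VAL=<bound> -DB_VAL=<bound>
//
// plus -DBIAS when an optional bias tensor is provided.
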
#ifdef BIAS
// This function performs in-place bias addition for the integer data type when bias is enabled.
// Note: The tile's dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(int, M0, N0, acc), uint x)
{
    TILE(int, 1, N0, bias_tile);

    // The T_LOAD below expands to use bias_ptr and bias_offset_first_element_in_bytes
    T_LOAD(int, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);

    // c = c + bias[broadcasted]
    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, acc, bias_tile, acc);
}
#endif // defined(BIAS)

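// A note on the quantized accumulation used by all four kernels below (an explanatory
// sketch of the algebra, not part of the original documentation): with the per-tensor
// offset terms LHS_OFFSET and RHS_OFFSET, the kernels rely on the expansion
//
//   sum_k (a_k - LHS_OFFSET) * (b_k - RHS_OFFSET)
//       =   sum_k a_k * b_k
//         - RHS_OFFSET * sum_k a_k
//         - LHS_OFFSET * sum_k b_k
//         + K * LHS_OFFSET * RHS_OFFSET
//
// which is why each int32 accumulator starts at K * LHS_OFFSET * RHS_OFFSET, the raw
// products are accumulated inside the K loop, and the row/column sums (a_sum / b_sum),
// scaled by the opposite operand's offset term, are subtracted after the loop.
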
#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL supply the output bounds for the relu and bounded relu operations.
 * @note The quantized value that represents real 0 is the quantization offset of the output data; it must be passed at compile time using -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
 * @param[in]  lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w The width of the lhs tensor
 * @param[in]  lhs_h The height of the lhs tensor
 * @param[in]  lhs_n Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w The width of the rhs tensor
 * @param[in]  rhs_h The height of the rhs tensor
 * @param[in]  rhs_n Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w The width of the dst tensor
 * @param[in]  dst_h The height of the dst tensor
 * @param[in]  dst_n Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
#ifdef BIAS
    TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load the tile from the rhs tensor in a transposed fashion,
        // so that the T_MMUL_NT_T macro can be used: only this macro
        // can use the dot product instruction for Int8/UInt8, since it
        // multiplies rows of the Lhs tensor by rows of the Rhs tensor directly.
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor in a transposed fashion.
        // See the main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

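    // The epilogue below is shared by all four kernels in this file: the int32
    // accumulators are re-quantized to DATA_TYPE. As a rough sketch (the exact rounding
    // is defined by T_QUANTIZE8_ASYMMETRIC in tile_helpers.h), the rescale factor,
    // conventionally lhs_scale * rhs_scale / dst_scale, is applied through the
    // fixed-point pair (DST_MULTIPLIER, DST_SHIFT), then DST_OFFSET is added and the
    // result is saturated to the range of DATA_TYPE.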
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100229 // Quantize the tile
230 TILE(DATA_TYPE, M0, N0, accq);
231 T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
232
Mohammed Suhail Munshic9eeee52023-06-30 15:43:29 +0100233 T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
234
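    // A hypothetical illustration of the partial-store handling below: with M0 = 4 and
    // PARTIAL_STORE_M0 = 3, the work-items covering the leftover rows (y_cond == true)
    // build indirect_buffer = {0, 1, 2, 2}, i.e. rows beyond the tensor boundary are
    // clamped onto the last valid row, while T_STORE_INDIRECT_WIDTH_SELECT limits the
    // stored width to PARTIAL_STORE_N0 for the work-items flagged by x_cond.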
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100235 TILE(int, M0, 1, indirect_buffer);
236 LOOP_UNROLLING(int, _i, 0, 1, M0,
237 {
238 indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
239 });
240
241 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
242}
243#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
244
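// The variant below differs from mat_mul_native_quantized_nt_nt mainly in how the RHS is
// addressed: because the RHS matrix is already transposed (stored as [N, K]), each block of
// output columns selects rows of the RHS (x * rhs_stride_y), both tiles are loaded with a
// plain T_LOAD, and the K loop advances the RHS by K0 elements rather than K0 rows.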
#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL supply the output bounds for the relu and bounded relu operations.
 * @note The quantized value that represents real 0 is the quantization offset of the output data; it must be passed at compile time using -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0, K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w The width of the lhs tensor
 * @param[in]  lhs_h The height of the lhs tensor
 * @param[in]  lhs_n Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w The width of the rhs tensor
 * @param[in]  rhs_h The height of the rhs tensor
 * @param[in]  rhs_n Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w The width of the dst tensor
 * @param[in]  dst_h The height of the dst tensor
 * @param[in]  dst_n Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_nt_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
#ifdef BIAS
    TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    // Leftover loop
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                a_sum[0].s[i] += (int)a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)

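// In the variant below the LHS is transposed (stored as [K, M]) while the RHS is not: the LHS
// is indexed by y * sizeof(DATA_TYPE), both tiles are loaded with T_LOAD_TRANSPOSED, and the
// K loop advances both operands by K0 rows (K0 * stride_y).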
#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL supply the output bounds for the relu and bounded relu operations.
 * @note The quantized value that represents real 0 is the quantization offset of the output data; it must be passed at compile time using -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w The width of the lhs tensor
 * @param[in]  lhs_h The height of the lhs tensor
 * @param[in]  lhs_n Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w The width of the rhs tensor
 * @param[in]  rhs_h The height of the rhs tensor
 * @param[in]  rhs_n Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w The width of the dst tensor
 * @param[in]  dst_h The height of the dst tensor
 * @param[in]  dst_n Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
#ifdef BIAS
    TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs/rhs tensors in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs/rhs tensors in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, N0,
            {
                b_sum[0].s[j] += (int)b[j].s[i];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
    }
#endif // ((K % K0) != 0)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)

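// In the variant below both operands are transposed: the LHS (stored as [K, M]) is loaded with
// T_LOAD_TRANSPOSED and advanced by K0 rows per iteration, while the RHS (stored as [N, K]) is
// indexed by x * rhs_stride_y, loaded with a plain T_LOAD and advanced by K0 elements.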
#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The fused activation function must be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL supply the output bounds for the relu and bounded relu operations.
 * @note The quantized value that represents real 0 is the quantization offset of the output data; it must be passed at compile time using -DZERO_POINT
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 8, 16
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w The width of the lhs tensor
 * @param[in]  lhs_h The height of the lhs tensor
 * @param[in]  lhs_n Number of the matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
 * @param[in]  rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w The width of the rhs tensor
 * @param[in]  rhs_h The height of the rhs tensor
 * @param[in]  rhs_n Number of the matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w The width of the dst tensor
 * @param[in]  dst_h The height of the dst tensor
 * @param[in]  dst_n Number of the matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_quantized_t_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, BUFFER),
#ifdef BIAS
    TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(int, M0, N0, acc);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
    })

    TILE(int, 1, M0, a_sum);
    a_sum[0].v = 0;

    TILE(int, 1, N0, b_sum);
    b_sum[0].v = 0;

    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if((K % K0) != 0)
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0;
        })

        // Load tile from the lhs tensor in a transposed fashion
        // see mat_mul_native_quantized_nt_nt main loop for more explanation
        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

        // Load tile from the rhs tensor
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                a_sum[0].s[j] += (int)a[j].s[i];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, 1,
            {
                b_sum[0].s[i] += (int)b[i].s[j];
            })
        })

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // ((K % K0) != 0)

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        LOOP_UNROLLING(int, j, 0, 1, N0,
        {
            acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
        })
    })

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    // Quantize the tile
    TILE(DATA_TYPE, M0, N0, accq);
    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)