/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"

#ifdef BIAS
// This function performs in-place bias addition for float/half data types when bias is enabled.
// Note: The tile's dimensions (M0 and N0) must be passed at compile time using -DN0 and -DM0 (e.g. -DN0=8, -DM0=4).
inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
{
    TILE(DATA_TYPE, 1, N0, bias_tile);

    // The T_LOAD below expands to use bias_ptr and bias_offset_first_element_in_bytes
    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);

    // acc = acc + bias[broadcasted]
    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
}
#endif // defined(BIAS)
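
// A conceptual sketch (illustrative only, not the actual macro expansion) of
// what the broadcast addition in perform_bias_addition computes: the single
// N0-wide bias row is added to each of the M0 accumulator rows.
//
//     for(int m = 0; m < M0; ++m)
//     {
//         acc[m].v += bias_tile[0].v;
//     }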

#if defined(MAT_MUL_NATIVE_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension.
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The fused activation function should be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for bounded activation functions.
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
 * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                              The width of the lhs tensor
 * @param[in]  lhs_h                              The height of the lhs tensor
 * @param[in]  lhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
 * @param[in]  rhs_img                            (Optional) Read-only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
 * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                              The width of the rhs tensor
 * @param[in]  rhs_h                              The height of the rhs tensor
 * @param[in]  rhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                              The width of the dst tensor
 * @param[in]  dst_h                              The height of the dst tensor
 * @param[in]  dst_n                              Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_nt_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
#ifdef BIAS
    TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(DATA_TYPE, M0, N0, acc);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = 0.f;
    })

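    // The rhs matrices of the batch are stacked along the height (y) dimension,
    // so z * rhs_h below is the row offset of this batch's rhs matrix; keeping
    // the offset two-dimensional also suits the RHS_TENSOR_TYPE=IMAGE path.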
    const int rhs_z = z * rhs_h;
    int k;
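    // Blocked loop over K: each iteration multiplies an M0 x K0 lhs tile by a
    // K0 x N0 rhs tile and accumulates into acc. For example, with K=6 and K0=4
    // this loop runs once (k = 0) and the leftover loop below handles k = 4, 5.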
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, K0, N0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, acc);

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if K % K0 != 0
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, 1, N0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, acc);

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // K % K0 != 0

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

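    // Row remapping for the partial store: when this work-item handles the
    // boundary block (y_cond), rows past PARTIAL_STORE_M0 are clamped to the
    // last valid row so the indirect store below stays in bounds.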
    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_NT_NT)
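
// A minimal host-side sketch (illustrative only; the option values below are
// assumptions, not taken from this file) of how the compile-time options the
// kernels in this file require might be supplied via the standard OpenCL C API:
//
//     const char *build_opts =
//         "-DMAT_MUL_NATIVE_NT_NT -DDATA_TYPE=float "
//         "-DM0=4 -DN0=4 -DK0=4 -DK=32 "
//         "-DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0 "
//         "-DRHS_TENSOR_TYPE=BUFFER "
//         "-DACTIVATION_TYPE=identity -DA_VAL=0.0f -DB_VAL=0.0f";
//     cl_int err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);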

#if defined(MAT_MUL_NATIVE_NT_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension.
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The fused activation function should be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for bounded activation functions.
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
 * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                              The width of the lhs tensor
 * @param[in]  lhs_h                              The height of the lhs tensor
 * @param[in]  lhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
 * @param[in]  rhs_img                            (Optional) Read-only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
 * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                              The width of the rhs tensor
 * @param[in]  rhs_h                              The height of the rhs tensor
 * @param[in]  rhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                              The width of the dst tensor
 * @param[in]  dst_h                              The height of the dst tensor
 * @param[in]  dst_n                              Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
                                  TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
#ifdef BIAS
                                  TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
                                  TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(DATA_TYPE, M0, N0, acc);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = 0.f;
    })

    const int rhs_z = z * rhs_h;
    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // On Midgard, transpose the rhs tile in registers and use the NT/NT
        // T_MMUL: it is partly vectorized and triggers fewer loop unrollings
        // than the NT/T version, and the code behaves as expected. Although this
        // is not a performant solution for the specified architecture, it is
        // necessary to overcome some limitations.
        TILE(DATA_TYPE, K0, N0, bt);
        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                bt[j].s[i] = b[i].s[j];
            })
        })
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, bt, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if K % K0 != 0
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // See the main loop for the explanation of this part
        TILE(DATA_TYPE, 1, N0, bt);
        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            bt[0].s[i] = b[i].s[0];
        })
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, bt, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // K % K0 != 0

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_NT_T)

#if defined(MAT_MUL_NATIVE_T_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension.
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The fused activation function should be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for bounded activation functions.
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 8, 16
 *  - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
 *  - K0 > 0
 * @note Values > 8 for M0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
 * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                              The width of the lhs tensor
 * @param[in]  lhs_h                              The height of the lhs tensor
 * @param[in]  lhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
 * @param[in]  rhs_img                            (Optional) Read-only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
 * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                              The width of the rhs tensor
 * @param[in]  rhs_h                              The height of the rhs tensor
 * @param[in]  rhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                              The width of the dst tensor
 * @param[in]  dst_h                              The height of the dst tensor
 * @param[in]  dst_n                              Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_t_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
#ifdef BIAS
    TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(DATA_TYPE, M0, N0, acc);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = 0.f;
    })

    const int rhs_z = z * rhs_h;
    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, K0, M0, a);
        TILE(DATA_TYPE, K0, N0, b);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // For explanation, see mat_mul_native_nt_t
        TILE(DATA_TYPE, M0, K0, at);
        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                at[j].s[i] = a[i].s[j];
            })
        })
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, b, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, NT, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
    }

#if K % K0 != 0
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, 1, M0, a);
        TILE(DATA_TYPE, 1, N0, b);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // For explanation, see mat_mul_native_nt_t
        TILE(DATA_TYPE, M0, 1, at);
        LOOP_UNROLLING(int, j, 0, 1, M0,
        {
            at[j].s[0] = a[0].s[j];
        })
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, b, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, NT, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
    }
#endif // K % K0 != 0

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_T_NT)

#if defined(MAT_MUL_NATIVE_T_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension.
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The fused activation function should be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for bounded activation functions.
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 8, 16
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
 * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                              The width of the lhs tensor
 * @param[in]  lhs_h                              The height of the lhs tensor
 * @param[in]  lhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
 * @param[in]  rhs_img                            (Optional) Read-only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
 * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                              The width of the rhs tensor
 * @param[in]  rhs_h                              The height of the rhs tensor
 * @param[in]  rhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                              The width of the dst tensor
 * @param[in]  dst_h                              The height of the dst tensor
 * @param[in]  dst_n                              Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_t_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
#ifdef BIAS
    TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(DATA_TYPE, M0, N0, acc);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = 0.f;
    })

    const int rhs_z = z * rhs_h;
    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, K0, M0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // For explanation, see mat_mul_native_nt_t
        TILE(DATA_TYPE, M0, K0, at);
        TILE(DATA_TYPE, K0, N0, bt);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                at[j].s[i] = a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                bt[j].s[i] = b[i].s[j];
            })
        })

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, bt, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
    }

#if K % K0 != 0
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, 1, M0, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // For explanation, see mat_mul_native_nt_t
        TILE(DATA_TYPE, M0, 1, at);
        TILE(DATA_TYPE, 1, N0, bt);

        LOOP_UNROLLING(int, j, 0, 1, M0,
        {
            at[j].s[0] = a[0].s[j];
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            bt[0].s[i] = b[i].s[0];
        })

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, bt, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
    }
#endif // K % K0 != 0

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_T_T)