blob: 9656a59728e698f79ff54bb0509d96b22f77c514 [file] [log] [blame]
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +00001/*
2 * Copyright (c) 2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +010024#include "activation_float_helpers.h"
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000025#include "helpers.h"
26#include "tile_helpers.h"
27
28#if defined(MAT_MUL_NATIVE_NT_NT)
29/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
30 *
31 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
32 * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
Gunes Bayir8918b232023-03-17 13:52:21 +000033 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000034 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +010035 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
Gunes Bayir8918b232023-03-17 13:52:21 +000036 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000037 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
Gunes Bayirbbeef722023-03-20 10:19:10 +000038 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
Gunes Bayir8918b232023-03-17 13:52:21 +000039 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_NT)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000040 * @note Only the following configurations of M0, N0 and K0 are currently supported:
41 * - M0 > 0
Gunes Bayirbbeef722023-03-20 10:19:10 +000042 * - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000043 * - K0 = 1, 2, 3, 4, 8, 16
44 * @note Values > 8 for M0 are not expected to be efficient
45 *
46 * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
47 * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
48 * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
49 * @param[in] lhs_w The width of the lhs tensor
50 * @param[in] lhs_h The height of the lhs tensor
51 * @param[in] lhs_n Number of the matrices (buffers) in the batch
52 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
Gunes Bayirbbeef722023-03-20 10:19:10 +000053 * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
Gunes Bayir8918b232023-03-17 13:52:21 +000054 * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000055 * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
56 * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
57 * @param[in] rhs_w The width of the rhs tensor
58 * @param[in] rhs_h The height of the rhs tensor
59 * @param[in] rhs_n Number of the matrices (buffers) in the batch
60 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
Gunes Bayir8918b232023-03-17 13:52:21 +000061 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000062 * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
63 * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
64 * @param[in] dst_w The width of the dst tensor
65 * @param[in] dst_h The height of the dst tensor
66 * @param[in] dst_n Number of the matrices (buffers) in the batch
67 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
68 */
69__kernel void mat_mul_native_nt_nt(
70 TENSOR3D_T(lhs, BUFFER),
Gunes Bayirbbeef722023-03-20 10:19:10 +000071 TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000072 TENSOR3D_T(dst, BUFFER))
73{
74 const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
75 const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
76 const uint z = GET_SPATIAL_IDX(2, 1, 0);
77
78 // Compute LHS/RHS/DST matrix address
79 lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000080 dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
81
82 // Initialize the accumulators
83 TILE(DATA_TYPE, M0, N0, acc);
84
85 LOOP_UNROLLING(int, i, 0, 1, M0,
86 {
87 acc[i].v = 0.f;
88 })
89
Gunes Bayirbbeef722023-03-20 10:19:10 +000090 const int rhs_z = z * rhs_h;
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +010091 int k;
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000092 for(k = 0; k <= K - K0; k += K0)
93 {
94 TILE(DATA_TYPE, M0, K0, a);
95 TILE(DATA_TYPE, K0, N0, b);
96
97 LOOP_UNROLLING(int, i, 0, 1, M0,
98 {
99 a[i].v = 0.f;
100 })
101
102 LOOP_UNROLLING(int, i, 0, 1, K0,
103 {
104 b[i].v = 0.f;
105 })
106
107 // Load tile from the lhs/rhs tensors
108 T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Gunes Bayirbbeef722023-03-20 10:19:10 +0000109 T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000110
111 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, acc);
112
113 lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000114 }
115
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100116#if K % K0 != 0
Gunes Bayir8918b232023-03-17 13:52:21 +0000117 /* Leftover Loop */
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000118 for(; k < K; ++k)
119 {
120 TILE(DATA_TYPE, M0, 1, a);
121 TILE(DATA_TYPE, 1, N0, b);
122
123 LOOP_UNROLLING(int, i, 0, 1, M0,
124 {
125 a[i].v = 0.f;
126 })
127
128 LOOP_UNROLLING(int, i, 0, 1, 1,
129 {
130 b[i].v = 0.f;
131 })
132
133 // Load tile from the lhs/rhs tensors
134 T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Gunes Bayirbbeef722023-03-20 10:19:10 +0000135 T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000136
137 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, acc);
138
139 lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000140 }
141#endif // K % K0 != 0
142
143 const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
144 const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
145
146 TILE(int, M0, 1, indirect_buffer);
147 LOOP_UNROLLING(int, _i, 0, 1, M0,
148 {
149 indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
150 });
151
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100152 T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
153
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000154 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
155}
156#endif // defined(MAT_MUL_NATIVE_NT_NT)
157
158#if defined(MAT_MUL_NATIVE_NT_T)
159/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
160 *
161 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
162 * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
Gunes Bayir8918b232023-03-17 13:52:21 +0000163 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000164 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100165 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
Gunes Bayir8918b232023-03-17 13:52:21 +0000166 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000167 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
Ramy Elgammalb531b752023-03-20 10:19:10 +0000168 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
Gunes Bayir8918b232023-03-17 13:52:21 +0000169 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_T)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000170 * @note Only the following configurations of M0, N0 and K0 are currently supported:
171 * - M0 > 0
172 * - N0 = 1, 2, 3, 4, 8, 16
Ramy Elgammalb531b752023-03-20 10:19:10 +0000173 * - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000174 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
175 *
176 * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
177 * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
178 * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
179 * @param[in] lhs_w The width of the lhs tensor
180 * @param[in] lhs_h The height of the lhs tensor
181 * @param[in] lhs_n Number of the matrices (buffers) in the batch
182 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
Ramy Elgammalb531b752023-03-20 10:19:10 +0000183 * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
Gunes Bayir8918b232023-03-17 13:52:21 +0000184 * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000185 * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
186 * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
187 * @param[in] rhs_w The width of the rhs tensor
188 * @param[in] rhs_h The height of the rhs tensor
189 * @param[in] rhs_n Number of the matrices (buffers) in the batch
190 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
Gunes Bayir8918b232023-03-17 13:52:21 +0000191 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000192 * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
193 * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
194 * @param[in] dst_w The width of the dst tensor
195 * @param[in] dst_h The height of the dst tensor
196 * @param[in] dst_n Number of the matrices (buffers) in the batch
197 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
198 */
199__kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
Ramy Elgammalb531b752023-03-20 10:19:10 +0000200 TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000201 TENSOR3D_T(dst, BUFFER))
202
203{
204 const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
205 const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
206 const uint z = GET_SPATIAL_IDX(2, 1, 0);
207
208 // Compute LHS/RHS/DST matrix address
209 lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000210 dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
211
212 // Initialize the accumulators
213 TILE(DATA_TYPE, M0, N0, acc);
214
215 LOOP_UNROLLING(int, i, 0, 1, M0,
216 {
217 acc[i].v = 0.f;
218 })
219
Ramy Elgammalb531b752023-03-20 10:19:10 +0000220 const int rhs_z = z * rhs_h;
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100221 int k;
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000222 for(k = 0; k <= K - K0; k += K0)
223 {
224 TILE(DATA_TYPE, M0, K0, a);
225 TILE(DATA_TYPE, N0, K0, b);
226
227 LOOP_UNROLLING(int, i, 0, 1, M0,
228 {
229 a[i].v = 0.f;
230 })
231
232 LOOP_UNROLLING(int, i, 0, 1, N0,
233 {
234 b[i].v = 0.f;
235 })
236
237 // Load tile from the lhs/rhs tensors
238 T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Ramy Elgammalb531b752023-03-20 10:19:10 +0000239 T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000240
241#if GPU_ARCH == GPU_ARCH_MIDGARD
242 // This part is written to decrease the number of loop unrollings caused
243 // by T_MMUL. The NT/NT version is partly vectorized and uses less number
244 // of loop unrollings, and code behaves as expected. Although this is not
245 // a performant solution for the specified architecture, it is necessary
246 // to overcome some limitations.
247 TILE(DATA_TYPE, K0, N0, bt);
248 LOOP_UNROLLING(int, i, 0, 1, N0,
249 {
250 LOOP_UNROLLING(int, j, 0, 1, K0,
251 {
252 bt[j].s[i] = b[i].s[j];
253 })
254 })
255 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, bt, acc);
Gunes Bayir8918b232023-03-17 13:52:21 +0000256#else // GPU_ARCH == GPU_ARCH_MIDGARD
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000257 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, acc);
258#endif // GPU_ARCH == GPU_ARCH_MIDGARD
259
260 lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000261 }
262
263#if K % K0 != 0
264 /* Leftover Loop */
265 for(; k < K; ++k)
266 {
267 TILE(DATA_TYPE, M0, 1, a);
268 TILE(DATA_TYPE, N0, 1, b);
269
270 LOOP_UNROLLING(int, i, 0, 1, M0,
271 {
272 a[i].v = 0.f;
273 })
274
275 LOOP_UNROLLING(int, i, 0, 1, N0,
276 {
277 b[i].v = 0.f;
278 })
279
280 // Load tile from the lhs/rhs tensors
281 T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Ramy Elgammalb531b752023-03-20 10:19:10 +0000282 T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000283
284#if GPU_ARCH == GPU_ARCH_MIDGARD
285 // See the main loop for the explanation of this part
286 TILE(DATA_TYPE, 1, N0, bt);
287 LOOP_UNROLLING(int, i, 0, 1, N0,
288 {
289 bt[0].s[i] = b[i].s[0];
290 })
291 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, bt, acc);
Gunes Bayir8918b232023-03-17 13:52:21 +0000292#else // GPU_ARCH == GPU_ARCH_MIDGARD
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000293 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, T, a, b, acc);
294#endif // GPU_ARCH == GPU_ARCH_MIDGARD
295
296 lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000297 }
298#endif // K % K0 != 0
299
300 const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
301 const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
302
303 TILE(int, M0, 1, indirect_buffer);
304 LOOP_UNROLLING(int, _i, 0, 1, M0,
305 {
306 indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
307 });
308
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100309 T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
310
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000311 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
312}
Gunes Bayir8918b232023-03-17 13:52:21 +0000313#endif // defined(MAT_MUL_NATIVE_NT_T)
314
315#if defined(MAT_MUL_NATIVE_T_NT)
316/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only
317 *
318 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
319 * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
320 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
321 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100322 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
Gunes Bayir8918b232023-03-17 13:52:21 +0000323 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
324 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
Gunes Bayirbbeef722023-03-20 10:19:10 +0000325 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
Gunes Bayir8918b232023-03-17 13:52:21 +0000326 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)
327 * @note Only the following configurations of M0, N0 and K0 are currently supported:
328 * - M0 = 1, 2, 3, 4, 8, 16
Gunes Bayirbbeef722023-03-20 10:19:10 +0000329 * - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
Gunes Bayir8918b232023-03-17 13:52:21 +0000330 * - K0 > 0
331 * * @note Values > 8 for M0, and K0 are not expected to be efficient
332 *
333 * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
334 * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
335 * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
336 * @param[in] lhs_w The width of the lhs tensor
337 * @param[in] lhs_h The height of the lhs tensor
338 * @param[in] lhs_n Number of the matrices (buffers) in the batch
339 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
Gunes Bayirbbeef722023-03-20 10:19:10 +0000340 * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
Gunes Bayir8918b232023-03-17 13:52:21 +0000341 * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
342 * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
343 * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
344 * @param[in] rhs_w The width of the rhs tensor
345 * @param[in] rhs_h The height of the rhs tensor
346 * @param[in] rhs_n Number of the matrices (buffers) in the batch
347 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
348 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
349 * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
350 * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
351 * @param[in] dst_w The width of the dst tensor
352 * @param[in] dst_h The height of the dst tensor
353 * @param[in] dst_n Number of the matrices (buffers) in the batch
354 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
355 */
356__kernel void mat_mul_native_t_nt(
357 TENSOR3D_T(lhs, BUFFER),
Gunes Bayirbbeef722023-03-20 10:19:10 +0000358 TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
Gunes Bayir8918b232023-03-17 13:52:21 +0000359 TENSOR3D_T(dst, BUFFER))
360{
361 const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
362 const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
363 const uint z = GET_SPATIAL_IDX(2, 1, 0);
364
365 // Compute LHS/RHS/DST matrix address
366 lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
Gunes Bayir8918b232023-03-17 13:52:21 +0000367 dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
368
369 // Initialize the accumulators
370 TILE(DATA_TYPE, M0, N0, acc);
371
372 LOOP_UNROLLING(int, i, 0, 1, M0,
373 {
374 acc[i].v = 0.f;
375 })
376
Gunes Bayirbbeef722023-03-20 10:19:10 +0000377 const int rhs_z = z * rhs_h;
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100378 int k;
Gunes Bayir8918b232023-03-17 13:52:21 +0000379 for(k = 0; k <= K - K0; k += K0)
380 {
381 TILE(DATA_TYPE, K0, M0, a);
382 TILE(DATA_TYPE, K0, N0, b);
383
384 LOOP_UNROLLING(int, i, 0, 1, K0,
385 {
386 a[i].v = 0.f;
387 })
388
389 LOOP_UNROLLING(int, i, 0, 1, K0,
390 {
391 b[i].v = 0.f;
392 })
393
394 // Load tile from the lhs/rhs tensors
395 T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Gunes Bayirbbeef722023-03-20 10:19:10 +0000396 T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
Gunes Bayir8918b232023-03-17 13:52:21 +0000397
398#if GPU_ARCH == GPU_ARCH_MIDGARD
399 // For explanation, see mat_mul_native_nt_t
400 TILE(DATA_TYPE, M0, K0, at);
401 LOOP_UNROLLING(int, i, 0, 1, K0,
402 {
403 LOOP_UNROLLING(int, j, 0, 1, M0,
404 {
405 at[j].s[i] = a[i].s[j];
406 })
407 })
408 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, b, acc);
409#else // GPU_ARCH == GPU_ARCH_MIDGARD
410 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, NT, a, b, acc);
411#endif // GPU_ARCH == GPU_ARCH_MIDGARD
412
413 lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
Gunes Bayir8918b232023-03-17 13:52:21 +0000414 }
415
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100416#if K % K0 != 0
Gunes Bayir8918b232023-03-17 13:52:21 +0000417 /* Leftover Loop */
418 for(; k < K; ++k)
419 {
420 TILE(DATA_TYPE, 1, M0, a);
421 TILE(DATA_TYPE, 1, N0, b);
422
423 LOOP_UNROLLING(int, i, 0, 1, 1,
424 {
425 a[i].v = 0.f;
426 })
427
428 LOOP_UNROLLING(int, i, 0, 1, 1,
429 {
430 b[i].v = 0.f;
431 })
432
433 // Load tile from the lhs/rhs tensors
434 T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Gunes Bayirbbeef722023-03-20 10:19:10 +0000435 T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
Gunes Bayir8918b232023-03-17 13:52:21 +0000436
437#if GPU_ARCH == GPU_ARCH_MIDGARD
438 // For explanation, see mat_mul_native_nt_t
439 TILE(DATA_TYPE, M0, 1, at);
440 LOOP_UNROLLING(int, j, 0, 1, M0,
441 {
442 at[j].s[0] = a[0].s[j];
443 })
444 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, b, acc);
445#else // GPU_ARCH == GPU_ARCH_MIDGARD
446 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, NT, a, b, acc);
447#endif // GPU_ARCH == GPU_ARCH_MIDGARD
448
449 lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
Gunes Bayir8918b232023-03-17 13:52:21 +0000450 }
451#endif // K % K0 != 0
452
453 const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
454 const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
455
456 TILE(int, M0, 1, indirect_buffer);
457 LOOP_UNROLLING(int, _i, 0, 1, M0,
458 {
459 indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
460 });
461
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100462 T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
463
Gunes Bayir8918b232023-03-17 13:52:21 +0000464 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
465}
466#endif // defined(MAT_MUL_NATIVE_T_NT)
467
468#if defined(MAT_MUL_NATIVE_T_T)
469/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only
470 *
471 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
472 * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
473 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
474 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100475 * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
Gunes Bayir8918b232023-03-17 13:52:21 +0000476 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
477 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
Ramy Elgammalb531b752023-03-20 10:19:10 +0000478 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
Gunes Bayir8918b232023-03-17 13:52:21 +0000479 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)
480 * @note Only the following configurations of M0, N0 and K0 are currently supported:
481 * - M0 = 1, 2, 3, 4, 8, 16
482 * - N0 = 1, 2, 3, 4, 8, 16
Ramy Elgammalb531b752023-03-20 10:19:10 +0000483 * - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
Gunes Bayir8918b232023-03-17 13:52:21 +0000484 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
485 *
486 * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
487 * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
488 * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
489 * @param[in] lhs_w The width of the lhs tensor
490 * @param[in] lhs_h The height of the lhs tensor
491 * @param[in] lhs_n Number of the matrices (buffers) in the batch
492 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
Ramy Elgammalb531b752023-03-20 10:19:10 +0000493 * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
Gunes Bayir8918b232023-03-17 13:52:21 +0000494 * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
495 * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
496 * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
497 * @param[in] rhs_w The width of the rhs tensor
498 * @param[in] rhs_h The height of the rhs tensor
499 * @param[in] rhs_n Number of the matrices (buffers) in the batch
500 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
501 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
502 * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
503 * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
504 * @param[in] dst_w The width of the dst tensor
505 * @param[in] dst_h The height of the dst tensor
506 * @param[in] dst_n Number of the matrices (buffers) in the batch
507 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
508 */
509__kernel void mat_mul_native_t_t(
510 TENSOR3D_T(lhs, BUFFER),
Ramy Elgammalb531b752023-03-20 10:19:10 +0000511 TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
Gunes Bayir8918b232023-03-17 13:52:21 +0000512 TENSOR3D_T(dst, BUFFER))
513{
514 const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
515 const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
516 const uint z = GET_SPATIAL_IDX(2, 1, 0);
517
518 // Compute LHS/RHS/DST matrix address
519 lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
Gunes Bayir8918b232023-03-17 13:52:21 +0000520 dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
521
522 // Initialize the accumulators
523 TILE(DATA_TYPE, M0, N0, acc);
524
525 LOOP_UNROLLING(int, i, 0, 1, M0,
526 {
527 acc[i].v = 0.f;
528 })
529
Ramy Elgammalb531b752023-03-20 10:19:10 +0000530 const int rhs_z = z * rhs_h;
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100531 int k;
Gunes Bayir8918b232023-03-17 13:52:21 +0000532 for(k = 0; k <= K - K0; k += K0)
533 {
534 TILE(DATA_TYPE, K0, M0, a);
535 TILE(DATA_TYPE, N0, K0, b);
536
537 LOOP_UNROLLING(int, i, 0, 1, K0,
538 {
539 a[i].v = 0.f;
540 })
541
542 LOOP_UNROLLING(int, i, 0, 1, N0,
543 {
544 b[i].v = 0.f;
545 })
546
547 // Load tile from the lhs/rhs tensors
548 T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Ramy Elgammalb531b752023-03-20 10:19:10 +0000549 T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
Gunes Bayir8918b232023-03-17 13:52:21 +0000550#if GPU_ARCH == GPU_ARCH_MIDGARD
551 // For explanation, see mat_mul_native_nt_t
552 TILE(DATA_TYPE, M0, K0, at);
553 TILE(DATA_TYPE, K0, N0, bt);
554
555 LOOP_UNROLLING(int, i, 0, 1, K0,
556 {
557 LOOP_UNROLLING(int, j, 0, 1, M0,
558 {
559 at[j].s[i] = a[i].s[j];
560 })
561 })
562
563 LOOP_UNROLLING(int, i, 0, 1, N0,
564 {
565 LOOP_UNROLLING(int, j, 0, 1, K0,
566 {
567 bt[j].s[i] = b[i].s[j];
568 })
569 })
570
571 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, bt, acc);
572#else // GPU_ARCH == GPU_ARCH_MIDGARD
573 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, T, a, b, acc);
574#endif // GPU_ARCH == GPU_ARCH_MIDGARD
575
576 lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
Gunes Bayir8918b232023-03-17 13:52:21 +0000577 }
578
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100579#if K % K0 != 0
Gunes Bayir8918b232023-03-17 13:52:21 +0000580 /* Leftover Loop */
581 for(; k < K; ++k)
582 {
583 TILE(DATA_TYPE, 1, M0, a);
584 TILE(DATA_TYPE, N0, 1, b);
585
586 LOOP_UNROLLING(int, i, 0, 1, 1,
587 {
588 a[i].v = 0.f;
589 })
590
591 LOOP_UNROLLING(int, i, 0, 1, N0,
592 {
593 b[i].v = 0.f;
594 })
595
596 // Load tile from the lhs/rhs tensors
597 T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Ramy Elgammalb531b752023-03-20 10:19:10 +0000598 T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
Gunes Bayir8918b232023-03-17 13:52:21 +0000599
600#if GPU_ARCH == GPU_ARCH_MIDGARD
601 // For explanation, see mat_mul_native_nt_t
602 TILE(DATA_TYPE, M0, 1, at);
603 TILE(DATA_TYPE, 1, N0, bt);
604
605 LOOP_UNROLLING(int, j, 0, 1, M0,
606 {
607 at[j].s[0] = a[0].s[j];
608 })
609
610 LOOP_UNROLLING(int, i, 0, 1, N0,
611 {
612 bt[0].s[i] = b[i].s[0];
613 })
614
615 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, bt, acc);
616#else // GPU_ARCH == GPU_ARCH_MIDGARD
617 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, T, a, b, acc);
618#endif // GPU_ARCH == GPU_ARCH_MIDGARD
619
620 lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
Gunes Bayir8918b232023-03-17 13:52:21 +0000621 }
622#endif // K % K0 != 0
623
624 const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
625 const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
626
627 TILE(int, M0, 1, indirect_buffer);
628 LOOP_UNROLLING(int, _i, 0, 1, M0,
629 {
630 indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
631 });
632
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100633 T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
634
Gunes Bayir8918b232023-03-17 13:52:21 +0000635 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
636}
637#endif // defined(MAT_MUL_NATIVE_T_T)