Blame - src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl - ml/ComputeLibrary

Gunes Bayir

e87fa66

2023-09-07 12:20:33 +0100

[diff] [blame]

/*

*

* SPDX-License-Identifier: MIT

5

*

6

* Permission is hereby granted, free of charge, to any person obtaining a copy

7

* of this software and associated documentation files (the "Software"), to

8

* deal in the Software without restriction, including without limitation the

9

* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

10

* sell copies of the Software, and to permit persons to whom the Software is

11

* furnished to do so, subject to the following conditions:

12

*

13

* The above copyright notice and this permission notice shall be included in all

14

* copies or substantial portions of the Software.

15

*

16

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

17

* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

18

* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

19

* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

20

* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

21

* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

22

* SOFTWARE.

23

*/

24

#include "activation_float_helpers.h"

25

#include "helpers.h"

26

#include "tile_helpers.h"

27

28

#ifdef BIAS

29

// Adds a bias row to the integer accumulator tile in place. Only compiled when BIAS is defined.

30

// @note The accumulator tile dimensions (M0, N0) must be passed at compile time using -DM0 and -DN0 (e.g. -DN0=8, -DM0=4).
//
// bias_ptr                           - base pointer of the bias buffer
// bias_offset_first_element_in_bytes - byte offset of the first bias element
// acc                                - M0 x N0 int tile; used as both input and output of the addition
// x                                  - X coordinate passed to T_LOAD as the start of the 1 x N0 bias row
//                                      (NOTE(review): presumably in elements, not bytes - confirm against T_LOAD)

31

inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(int, M0, N0, acc), uint x)

32

{

33

TILE(int, 1, N0, bias_tile);

34

35

// bias_ptr and bias_offset_first_element_in_bytes are not named below: the "bias" argument of
// T_LOAD expands to use them, so the parameter names must stay exactly as they are.

36

T_LOAD(int, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);

37

38

// acc = acc + bias_tile, with the single bias row broadcast to the rows of acc

39

T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, acc, bias_tile, acc);

40

}

41

#endif // defined(BIAS)

42

Gunes Bayir

a116cd3

2023-09-13 11:59:34 +0100

[diff] [blame^]

43

#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0) // Size of one MMUL block of the output matrix (MMUL_M0 x MMUL_N0); also the divisor used to split get_global_id(0) into a section index and a thread index

44

Gunes Bayir

e87fa66

2023-09-07 12:20:33 +0100

[diff] [blame]

45

#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)

46

/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only

47

*

Gunes Bayir

a116cd3

2023-09-13 11:59:34 +0100

[diff] [blame^]

48

* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it

49

* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension

50

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)

51

* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at

52

* compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).

53

* @note The number of leftover output rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER

54

* (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)

55

* @note The dimensions M, N, K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=5, -DN=8, -DK=6).

56

* K must be a multiple of 16.

57

* @note MMUL block sizes must be passed at compile time using -DMMUL_K0, -DMMUL_M0, -DMMUL_N0

58

* (e.g. -DMMUL_K0=16, -DMMUL_M0=4, -DMMUL_N0=4)

59

* @note If there is bias -DBIAS option must be passed at compile time

60

* @note Quantization offsets of lhs, rhs and dst tensors must be passed at compile time using -DLHS_OFFSET,

61

* -DRHS_OFFSET, -DDST_OFFSET (e.g. -DLHS_OFFSET=10, -DRHS_OFFSET=0, -DDST_OFFSET=-6)

62

* @note Effective quantization multiplier and shift for the destination tensor must be passed at compile time using

63

* -DDST_MULTIPLIER and -DDST_SHIFT (e.g. -DDST_MULTIPLIER=2091, -DDST_SHIFT=8)

64

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)

65

* @note Only the following configurations of M0, N0 and K0 are currently supported:

66

* - M0 > 0

67

* - N0 = 1, 2, 3, 4, 8, 16

68

* - K0 = 4

69

* @note For a generic view on how the MMUL works, see mat_mul_mmul.cl

Gunes Bayir

e87fa66

2023-09-07 12:20:33 +0100

[diff] [blame]

70

*

71

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8

72

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

73

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

74

* @param[in] lhs_w The width of the lhs tensor

75

* @param[in] lhs_h The height of the lhs tensor

76

* @param[in] lhs_n Number of the matrices (buffers) in the batch

77

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

78

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

79

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

80

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

81

* @param[in] rhs_w The width of the rhs tensor

82

* @param[in] rhs_h The height of the rhs tensor

83

* @param[in] rhs_n Number of the matrices (buffers) in the batch

84

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

Gunes Bayir

a116cd3

2023-09-13 11:59:34 +0100

[diff] [blame^]

85

* @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: S32

Gunes Bayir

e87fa66

2023-09-07 12:20:33 +0100

[diff] [blame]

86

* @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)

87

* @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)

88

* @param[in] bias_w (Optional) The size of the width dimension of the bias tensor

89

* @param[in] bias_h (Optional) The size of the height dimension of the bias tensor

90

* @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor

91

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor

92

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

93

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

94

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

95

* @param[in] dst_w The width of the dst tensor

96

* @param[in] dst_h The height of the dst tensor

97

* @param[in] dst_n Number of the matrices (buffers) in the batch

98

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

99

*/

100

// Quantized batch matrix multiplication, LHS and RHS non-transposed, built on arm_matrix_multiply().
// Each work-item accumulates an M0 x N0 int tile, applies the quantization-offset correction and the
// optional bias, requantizes the tile to DATA_TYPE and stores it to dst.
__kernel void mat_mul_native_quantized_mmul_nt_nt(

101

TENSOR3D_T(lhs, BUFFER),

102

TENSOR3D_T(rhs, BUFFER),

103

#ifdef BIAS

104

TENSOR3D_T(bias, BUFFER),

105

#endif // defined(BIAS)

106

TENSOR3D_T(dst, BUFFER))

107

{

Gunes Bayir

a116cd3

2023-09-13 11:59:34 +0100

[diff] [blame^]

108

// The explanation of how this kernel works is very similar to the explanation given in

109

// mat_mul_mmul.cl. The MMUL logic and terminology are the same. The only difference is

110

// that in quantized multiplication, the MMUL block sizes are (4 x 16) for Lhs matrix and

111

// (16 x 4) for Rhs matrix, resulting in (4 x 4) MMUL block size for the destination.

112

//

113

// Figures 1, 2 and 3 in the previous explanation work the same. Since the Lhs and Rhs

114

// MMUL block sizes are different in quantized extension, the thread access pattern is

115

// slightly different. We can redraw Figure 4 (Thread access pattern) as follows:

116

//

117

// (Modified Figure 4 from mat_mul_mmul.cl)

118

// Thread Access Layouts in LHS & RHS matrices

119

//

120

// LHS matrix

121

// 4 times 4 times 4 times 4 times

122

// _______________________________________________________________

123

// |T0_|T0_|T0_|T0_|T1_|T1_|T1_|T1_|T2_|T2_|T2_|T2_|T3_|T3_|T3_|T3_|

// |T0_| ... |

// M0 | . . |

// Times | . . |

// | . . |

// |T0_|T0_|T0_|T0_|T1_|T1_|T1_|T1_|T2_|T2_|T2_|T2_|T3_|T3_|T3_|T3_|

129

// |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|

130

// |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|

// M0 | . . |

// Times | . . |

// | . . |

// |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|

135

// |T8_|T8_|T8_|T8_|T9_|T9_|T9_|T9_|T10|T10|T10|T10|T11|T11|T11|T11|

// M0 | . |

// Times | . |

// | . |

// |T8_|T8_|T8_|T8_|T9_|T9_|T9_|T9_|T10|T10|T10|T10|T11|T11|T11|T11|

// M0 | . |

// Times | . |

// | . |

// |T12|T12|T12|T12|T13|T13|T13|T13|T14|T14|T14|T14|T15|T15|T15|T15|

//

//

// RHS Matrix

//

// __________N0 times______N0 times____________________N0 times_______

149

// |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|

150

// 4 times |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|

151

// |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|

152

// |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|

153

// |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|

154

// 4 times |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|

155

// |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|

156

// X |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|

157

// |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|

158

// |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|

159

// 4 times |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|

160

// |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|

161

// |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_|

162

// 4 times |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_|

163

// |__T12_| ... |__T12_|__T13_| ... |__T13_| ... |__T15_| ... |__T15_|

164

// |__T12_|_____|__T12_|__T13_|______|__T13_|_____|__T15_|_____|__T15_|

165

//

166

//

167

// The logic behind this thread access pattern is already described in the explanation

168

// in mat_mul_mmul.cl. The only change is that thread accesses are extended to 4 elements

169

// from 1, in rightward direction in Lhs, and in downward direction in Rhs, because they

170

// are now operating on 4 char/uchar's (again 32-bit data), instead of one 32-bit floating point.

171

//

172

// The mathematical view of the matrix multiplication explained in Figure 5 also holds for this,

173

// except the dimension 4 is 16 instead, but the vector notations do not change, i.e. it's as follows:

174

//

175

// Settings:

176

// - a 8 x 16 LHS section

177

// - 16 x 8 RHS section

178

// - Each vector variable ai, bj represent a 16x1 vector

179

// - ^T (superscript T) denotes transpose

180

// - M0 = N0 = 2

181

// - MMUL_N0 = MMUL_M0 = 4, MMUL_K0 = 16

182

//

183

//

184

// (Modified Figure 5)

185

// Mathematical view of the Matrix Multiplication

186

//

187

// LHS RHS DST

188

// [ a1^T ] [ b1 b2 b3 b4 b5 b6 b7 b8 ] [ a1^Tb1 a1^Tb2 a1^Tb3 ... a1^Tb8 ]

189

// [ a2^T ] 16 x 8 [ a2^Tb1 a2^Tb2 a2^Tb3 ... a2^Tb8 ]

190

// [ a3^T ] [ ]

191

// [ a4^T ] = [ . . ]

192

// [ a5^T ] X [ . . ]

193

// [ a6^T ] [ . . ]

194

// [ a7^T ] [ ]

195

// [ a8^T ] [ a8^Tb1 a8^Tb2 a8^Tb3 ... a8^Tb8 ]

// 8 x 16 8 x 8

//

//

// For the first iteration, i.e. (m0, n0) = (0, 0), the arm_matrix_multiply would multiply the following matrices:

200

//

201

// [ a1^T ] [ b1 b3 b5 b7 ] [ a1^Tb1 a1^Tb3 a1^Tb5 a1^Tb7 ]

202

// [ a3^T ] x 16 x 4 = [ a3^Tb1 a3^Tb3 a3^Tb5 a3^Tb7 ]

203

// [ a5^T ] [ a5^Tb1 a5^Tb3 a5^Tb5 a5^Tb7 ]

204

// [ a7^T ] [ a7^Tb1 a7^Tb3 a7^Tb5 a7^Tb7 ]

205

// 4 x 16 4 x 4

206

// The elements calculated in the 4x4 output block are the "interleaved" elements in the DST above.

207

// When we follow for each combination of (m0, n0), every element of the DST matrix "section" is filled.

208

//

209

// Please refer to mat_mul_mmul.cl for more details.

210

211

const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)

212

// The upper limit is a simplified version of ((N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE

213

const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)

214

const uint z = get_global_id(2); // Batch

215

216

// Get section coordinates: each run of MMUL_BLOCK_SIZE consecutive x0 values forms one section in X

217

const uint section_x = (x0 / MMUL_BLOCK_SIZE);

218

const uint section_y = y0;

219

220

// Get thread coordinates within an mmul block (row-major, MMUL_N0 threads per row)

221

const uint thread_id = (x0 % MMUL_BLOCK_SIZE);

222

const uint thread_x = thread_id % MMUL_N0;

223

const uint thread_y = (thread_id / MMUL_N0);

224

225

// Calculate dst coordinates.
// The unclamped values are kept for the out-of-bound test after the main loop; the clamped
// values (never beyond N - N0 / M - M0) are the ones used to address lhs, rhs and dst.

226

const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;

227

const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;

228

const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));

229

const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));

230

231

// Starting LHS coordinates

232

const uint lhs_x = K0 * thread_x;

233

const uint lhs_y = dst_y;

234

235

// Starting RHS coordinates

236

const uint rhs_x = dst_x;

237

const uint rhs_y = K0 * thread_y;

238

239

// Compute LHS/RHS/DST matrix address (byte offsets; z selects the matrix within the batch)

240

lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;

241

rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;

242

dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

243

244

// Initialize the accumulators with the constant term K * LHS_OFFSET * RHS_OFFSET.
// Together with the row/column-sum correction applied after the main loop, c becomes
//   sum(a * b) - RHS_OFFSET * rowsum(a) - LHS_OFFSET * colsum(b) + K * LHS_OFFSET * RHS_OFFSET

245

TILE(int, M0, N0, c);

246

LOOP_UNROLLING(int, i, 0, 1, M0,

247

{

248

c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);

249

})

250

251

// Calculate row and column sums (both start at zero and are accumulated in the main loop)

252

TILE(int, 1, N0, b_sum);

253

b_sum[0].v = 0;

254

255

TILE(int, 1, M0, a_sum);

256

a_sum[0].v = 0;

257

258

// Vector of ones used to turn arm_matrix_multiply() into a row/column sum.
// The four literal components here (and the b[0]..b[3] gathers below) hard-code K0 = 4.
VEC_DATA_TYPE(DATA_TYPE, K0)

259

vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);

260

261

for(int k = 0; k < lhs_w; k += MMUL_K0)

262

{

263

// M0 x K0 tile of LHS values (one K0-wide vector per row)

264

TILE(DATA_TYPE, M0, K0, a);

265

// K0 x N0 tile of RHS values (one N0-wide vector per row)

266

TILE(DATA_TYPE, K0, N0, b);

267

268

// Load tile from the lhs/rhs tensors

269

T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

270

T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

271

272

LOOP_UNROLLING(int, m0, 0, 1, M0,

273

{

274

LOOP_UNROLLING(int, n0, 0, 1, N0,

275

{

276

VEC_DATA_TYPE(DATA_TYPE, K0)

277

vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);

278

c[m0].s[n0] = arm_matrix_multiply(a[m0].v, vec_b, c[m0].s[n0]);

})

})

#if RHS_OFFSET != 0

// Row Sum of A: Calculate the sum of rows by multiplying A with

284

// a matrix of 1's from Right

285

LOOP_UNROLLING(int, m0, 0, 1, M0,

286

{

287

a_sum[0].s[m0] = arm_matrix_multiply(a[m0].v, vec_1, a_sum[0].s[m0]);

288

})

289

#endif // RHS_OFFSET != 0

290

291

#if LHS_OFFSET != 0

292

// Column Sum of B: Calculate the sum of columns by multiplying B

293

// with a matrix of 1's from Left

294

LOOP_UNROLLING(int, n0, 0, 1, N0,

295

{

296

VEC_DATA_TYPE(DATA_TYPE, K0)

297

vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);

298

b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]);

299

})

300

#endif // LHS_OFFSET != 0

301

302

// Advance by one MMUL block along K: MMUL_K0 elements to the right in lhs, MMUL_K0 rows down in rhs

lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);

303

rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;

304

}

305

306

// Do not write if the unclamped coordinates are out of bounds.

307

// The reads above still have to happen, as arm_matrix_multiply() expects a certain number of calls,
// which is why this early return comes only after the main loop.

308

if(dst_x_unclamped >= N || dst_y_unclamped >= M)

{

return;

}

#if RHS_OFFSET != 0 || LHS_OFFSET != 0

// Offset correction: subtract RHS_OFFSET * (row sum of A) + LHS_OFFSET * (column sum of B).
// A sum whose guarding #if was disabled in the main loop is still zero here, so its term vanishes.

314

LOOP_UNROLLING(int, i, 0, 1, M0,

315

{

316

const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];

317

LOOP_UNROLLING(int, j, 0, 1, N0,

318

{

319

c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];

320

})

321

})

322

#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0

323

324

#ifdef BIAS

325

perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);

326

#endif // defined(BIAS)

327

328

// Quantize the tile: convert the int accumulators in c to DATA_TYPE values in cq

329

TILE(DATA_TYPE, M0, N0, cq);

330

T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);

331

332

// Full-width store when the N0-wide tile fits inside the row (or there is no leftover in N);
// otherwise only N0_LEFTOVER columns are written. In both branches a row is skipped when it
// lies at or beyond M, unless M0_LEFTOVER == 0.
if(dst_x + N0 <= N || N0_LEFTOVER == 0)

333

{

334

LOOP_UNROLLING(int, m0, 0, 1, M0,

335

{

336

if(dst_y + m0 < M || M0_LEFTOVER == 0)

337

{

338

VSTORE(N0)

339

(cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));

}

})

}

else

{

LOOP_UNROLLING(int, m0, 0, 1, M0,

346

{

347

if(dst_y + m0 < M || M0_LEFTOVER == 0)

348

{

349

VSTORE_PARTIAL(N0, N0_LEFTOVER)

350

(cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));

351

}

352

})

353

}

Gunes Bayir

e87fa66

2023-09-07 12:20:33 +0100

[diff] [blame]

354

}

355

#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)

356

357

#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T)

358

/** Kernel entry point for the batch matrix multiplication (BatchMatMul) variant: LHS non-transposed, RHS transposed - buffer only
 *
 * @note The body of this kernel is empty in this file: it performs no loads, computation or stores.
 *       NOTE(review): presumably a placeholder for a future implementation - confirm.
 *
 * Supported block configurations:
 *  TODO: Report supported M0, N0, K0
 *
 * The argument list (lhs, rhs, optional bias, dst) is the same as that of mat_mul_native_quantized_mmul_nt_nt()
 */

365

__kernel void mat_mul_native_quantized_mmul_nt_t(

366

TENSOR3D_T(lhs, BUFFER),

367

TENSOR3D_T(rhs, BUFFER),

368

#ifdef BIAS

369

TENSOR3D_T(bias, BUFFER),

370

#endif // defined(BIAS)

371

TENSOR3D_T(dst, BUFFER))

372

{

373

}

374

#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T)

375

376

#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_NT)

377

/** Kernel entry point for the batch matrix multiplication (BatchMatMul) variant: LHS transposed, RHS non-transposed
 *
 * @note The body of this kernel is empty in this file: it performs no loads, computation or stores.
 *       NOTE(review): presumably a placeholder for a future implementation - confirm.
 *
 * Supported block configurations:
 *  TODO: Report supported M0, N0, K0
 *
 * The argument list (lhs, rhs, optional bias, dst) is the same as that of mat_mul_native_quantized_mmul_nt_nt()
 */

384

__kernel void mat_mul_native_quantized_mmul_t_nt(

385

TENSOR3D_T(lhs, BUFFER),

386

TENSOR3D_T(rhs, BUFFER),

387

#ifdef BIAS

388

TENSOR3D_T(bias, BUFFER),

389

#endif // defined(BIAS)

390

TENSOR3D_T(dst, BUFFER))

391

{

392

}

393

#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_NT)

394

395

#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_T)

396

/** Kernel entry point for the batch matrix multiplication (BatchMatMul) variant: LHS transposed, RHS transposed
 *
 * @note The body of this kernel is empty in this file: it performs no loads, computation or stores.
 *       NOTE(review): presumably a placeholder for a future implementation - confirm.
 *
 * Supported block configurations:
 *  TODO: Report supported M0, N0, K0
 *
 * The argument list (lhs, rhs, optional bias, dst) is the same as that of mat_mul_native_quantized_mmul_nt_nt()
 */

403

__kernel void mat_mul_native_quantized_mmul_t_t(

404

TENSOR3D_T(lhs, BUFFER),

405

TENSOR3D_T(rhs, BUFFER),

406

#ifdef BIAS

407

TENSOR3D_T(bias, BUFFER),

408

#endif // defined(BIAS)

409

TENSOR3D_T(dst, BUFFER))

410

{

411

}

412

#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_T)