Blame - src/core/CL/cl_kernels/common/mat_mul_mmul.cl - ml/ComputeLibrary

2023-05-19 14:23:37 +0100

[diff] [blame]

36

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_NT)

37

* @note Only the following configurations of M0, N0 and K0 are currently supported:

38

* - M0 > 0

39

* - N0 = 1, 2, 3, 4, 8, 16

40

* - K0 = 1

41

* @note Values > 8 for M0 are not expected to be efficient

42

*

43

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16

44

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

45

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

46

* @param[in] lhs_w The width of the lhs tensor

47

* @param[in] lhs_h The height of the lhs tensor

48

* @param[in] lhs_n Number of the matrices (buffers) in the batch

49

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

50

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

51

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

52

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

53

* @param[in] rhs_w The width of the rhs tensor

54

* @param[in] rhs_h The height of the rhs tensor

55

* @param[in] rhs_n Number of the matrices (buffers) in the batch

56

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

57

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

58

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

59

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

60

* @param[in] dst_w The width of the dst tensor

61

* @param[in] dst_h The height of the dst tensor

62

* @param[in] dst_n Number of the matrices (buffers) in the batch

63

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

64

* @param[in] M Number of rows in LHS matrix

65

* @param[in] N Number of columns in RHS matrix

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

66

* @param[in] K Number of columns in LHS matrix and rows in RHS matrix, which is multiple of MMUL_K0.

SiCong Li

2023-05-19 14:23:37 +0100

[diff] [blame]

67

*/

68

__kernel void mat_mul_native_mmul_nt_nt(

69

TENSOR3D_T(lhs, BUFFER),

70

TENSOR3D_T(rhs, BUFFER),

71

TENSOR3D_T(dst, BUFFER),

72

const int M,

Ramy Elgammal

2023-05-19 14:23:37 +0100

[diff] [blame]

73

const int N,

74

const int K)

SiCong Li

2023-05-19 14:23:37 +0100

[diff] [blame]

75

{

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

76

#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0) // MMUL block size for the output matrix

SiCong Li

2023-05-19 14:23:37 +0100

[diff] [blame]

77

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

78

// The output/destination matrix is divided into "sections". Each section is filled by a group of

79

// threads of size MMUL_BLOCK_SIZE, bundled together according to GWS_x.

80

// Each thread writes to a tile of M0 x N0 (the usual output block size for a thread) in the output matrix.

81

// Therefore, the section dimensions are (MMUL_M0 x M0) x (MMUL_N0 x N0).

82

83

// The GWS is constructed in such a way that the y global id is the y section coordinate,

84

// and the x global id is a transformed thread id: MMUL_BLOCK_SIZE number of consecutive threads

85

// in the x dimension corresponding to a section.

86

// This can be visualized as first obtaining the coordinates of all the sections:

87

// x = [0, (N / N0) / MMUL_N0) --> (N / N0) / MMUL_N0 is the number of sections in x dimension

88

// y = [0, (M / M0) / MMUL_M0) --> (M / M0) / MMUL_M0 is the number of sections in y dimension

89

// Then multiply the x coordinates with MMUL_SECTION_NUM_THREADS to get the consecutive thread ids in the x dimension

90

// x = [0, ((N / N0) / MMUL_N0) * MMUL_N0 * MMUL_M0)

91

// x = [0, (N / N0) * MMUL_MO)

92

const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)

93

// The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)

94

const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)

SiCong Li

2023-05-19 14:23:37 +0100

[diff] [blame]

95

const uint z = get_global_id(2); // Batch

96

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

97

// Get section coordinates

98

const uint section_x = (x0 / MMUL_BLOCK_SIZE);

99

const uint section_y = y0;

SiCong Li

2023-05-19 14:23:37 +0100

[diff] [blame]

100

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

101

// Within these sections, each thread writes onto a small output block of size M0 x N0

102

// in row major order. A section divided into tiles can be visualized as below.

103

//

104

// (Figure 1)

105

// A Section in the Output Matrix

106

//

107

// _____N0__________N0____________________N0____

// | | | | |

// | | | | |

// | | | | MMUL_N0 |

// |___________|__________|_________|___________|

113

// | | | |

114

// | | | |

115

// M0 | Thread | . | |

116

// | MMUL_N0+1 | . | | (M0 x MMUL_M0)

117

// |___________| . | |

// | . | |

// | . | |

// | . | |

// | |___________|

// | | |

// | | Thread |

// M0 | | MMUL_N0 x |

125

// | | MMUL_M0 |

126

// |________________________________|___________|

127

// N0 x MMUL_N0

128

//

129

// The output matrix has several of these sections. As shown above, each section

130

// will be filled by a separate thread group of size MMUL_BLOCK_SIZE. The overall

131

// section layout of the output matrix is as below. For instance, S(1,1) will be filled

132

// by MMUL_BLOCK_SIZE (possibly equal to 16) threads, so as S(0,1) and others.

//

// (Figure 2)

// DST Matrix

// ____________________________________

137

// | | | | |

138

// | S(0,0) | S(0,1) | ... | S(0, X) |

139

// |________|________|_______|_________|

140

// | | | | |

141

// | S(1,0) | S(1,1) | ... | S(1, X) |

142

// |________|________|_______|_________|

143

// | . | | |

144

// | . | | | Y = (M / M0) / MMUL_M0 - 1 (Max possible section y coordinate)

145

// | . | | | X = (N / N0) / MMUL_N0 - 1 (Max possible section x coordinate)

146

// |________|________|_________________|

147

// | | | | | S(y, x) denotes the section, and y and x are computed in

148

// | S(Y,0) | S(Y,1) | | S(Y, X) | section_y, section_x respectively.

149

// |________|________|_______|_________|

//

//

//

//

// A complete view involving the three matrices is given below. It examplifies how the section S(0,0) is computed.

//

// (Figure 3)

// Complete View

//

// LHS Matrix RHS Matrix DST Matrix

160

//

161

// ___MMUL_K0___________ __MMUL_N0 x N0____________ ___MMUL_N0 x N0____________________

162

// /|xxxxxxxxxx| | /|xxxxxxxxxxxxxxx| | /|xxxxxxxxxxxxxxxxxxx| |

163

// / |xxxxxxxxxx| | MMUK_K0 ||xxxxxxxxxxxxxxx| | / |xxxxxxxxxxxxxxxxxxx| |

164

// MMUL_M0 | |xxxxxxxxxx| ---> | ||xxxxxxxxxxxxxxx| . . . | MMUL_M0 | |xxxxxxxxxxxxxxxxxxx| |

165

// x M0 | |xxxxxxxxxx| | \|_______________|_________| x M0 | |xxxxxxxxxxxxxxxxxxx| ... |

166

167

// | |xxxxxxxxxx| | x | | | = \ |xxxxxxxxxxxxxxxxxxx| |

168

// \|__________|_________| | | | \|___________________| |

169

// | | | \/ | | |

170

// | , | |_________________________| | . |

171

// | , | | . |

172

// | , | | . |

173

// |____________________| |_________________________________|

174

//

175

// Horizontal and vertical arrows show the direction of K loop (main loop in the kernel).

176

// Each output section shown above is a zooomed out version of Figure 1.

177

//

178

// In each iteration of the main loop, LHS matrix traverses towards rightward, and RHS matrix traverses towards downward,

179

// the LHS section of (MMUL_M0 x M0) x MMUL_K0 and RHS section of MMUL_K0 x (MMUL_N0 x N0) is multiplied

180

// "cooperatively" using arm_matrix_multiply calls, and the result is accummulated over the output (DST) section

181

// of size (MMUL_M0 x M0) x (MMUL_N0 x N0) shown with 'x' signs.

182

//

183

// If it was a single thread, this multiplication would have been straightforward with a T_MMUL call.

184

// However, since it involves multiple threads working together using the aforementioned extension, it

185

// works slightly differently.

186

//

187

// Here is how threads access the LHS and RHS matrices:

188

// (Assume MMUL_K0 = MMUL_N0 = MMUL_M0 = 4 because the following diagram is heavily dependent on this)

189

//

190

// (Figure 4)

191

// Thread Access Layouts in LHS & RHS matrices

192

//

193

// LHS matrix RHS Matrix

194

// ___________________________ __________N0 times______N0 times____________________N0 times_______

195

// |__T0__|__T1__|__T2__|__T3__| |__T0__| ... |__T0__|__T1__| ... |__T1__| ... |__T3__| ... |__T3__|

196

// |__T0__| ... | |__T4__| ... |__T4__|__T5__| ... |__T5__| ... |__T7__| ... |__T7__|

197

// M0 | . . | |__T8__| ... |__T8__|__T9__| ... |__T9__| ... |__T11_| ... |__T11_|

198

// Times | . . | |__T12_|_____|__T12_|__T13_|______|__T13_|_____|__T15_|_____|__T15_|

199

// | . . | X

200

// |__T0__|__T1__|__T2__|__T3__|

201

// |__T4__|__T5__|__T6__|__T7__|

202

// |__T4__|__T5__|__T6__|__T7__|

// M0 | . . |

// Times | . . |

// | . . |

// |__T4__|__T5__|__T6__|__T7__|

207

// |__T8__|__T9__|__T10_|__T11_|

// M0 | . |

// Times | . |

// | . |

// |__T12_|__T13_|__T14_|__T15_|

// M0 | . |

// Times | . |

// | . |

// |__T12_|__T13_|__T14_|__T15_|

216

//

217

//

218

// This access layout is designed such that the threads access continuous elements of each matrix (in terms of row/column).

219

// To multiply these large sections, the arm_matrix_multiply call is made for each of the M0xN0 elements. So, for each

220

// combination of m0 and n0 (iterators of M0 and N0 from 0 to M0-1 and N0-1 respectively), one arm_matrix_multiply call is

221

// made, and MMUL_BLOCK_SIZE number of threads compute the result.

222

//

223

// The matrix multiplication taking place in this extension

224

// is an "interleaved" one, because, for example, if m0=0 and n0=0, i.e. the first iteration, we would use the first,

225

// M0-th, 2M0-th and 3M0-th rows of the LHS matrix. Similarly, we jump N0 steps in the RHS matrix. This is how we access

226

// one element for each thread in a single (m0, n0) loop.

227

//

228

// For example, if we have

229

// - a 8 x 4 LHS section

230

// - 4 x 8 RHS section

231

// - Each vector variable ai, bj represent a 4x1 vector

232

// - ^T (superscript T) denotes transpose

233

// - M0 = N0 = 2

234

// - MMUL_N0 = MMUL_M0 = MMUL_K0 = 4

235

//

236

// (Figure 5)

237

// Mathematical view of the Matrix Multiplication

238

//

239

// LHS RHS DST

240

// [ a1^T ] [ b1 b2 b3 b4 b5 b6 b7 ] [ a1^Tb1 a1^Tb2 a1^Tb3 ... a1^Tb7 ]

241

// [ a2^T ] 4 x 8 [ a2^Tb1 a2^Tb2 a2^Tb3 ... a2^Tb7 ]

242

// [ a3^T ] [ ]

243

// [ a4^T ] = [ . . ]

244

// [ a5^T ] X [ . . ]

245

// [ a6^T ] [ . . ]

246

// [ a7^T ] [ ]

247

// [ a8^T ] [ a7^Tb1 a7^Tb2 a7^Tb3 ... a7^Tb7 ]

// 8 x 4 8 x 8

//

//

// For the first iteration, i.e. (m0, n0) = (0, 0), the arm_matrix_multiply would multiply the following matrices:

252

//

253

// [ a1^T ] [ b1 b3 b5 b7 ] [ a1^Tb1 a1^Tb3 a1^Tb5 a1^Tb7 ]

254

// [ a3^T ] x 4 x 4 = [ a3^Tb1 a1^Tb3 a1^Tb5 a1^Tb7 ]

255

// [ a5^T ] [ a5^Tb1 a1^Tb3 a1^Tb5 a1^Tb7 ]

256

// [ a7^T ] [ a7^Tb1 a7^Tb3 a7^Tb5 a7^Tb7 ]

257

// 4 x 4 4 x 4

258

// The elements calculated in the 4x4 output block are the "interleaved" elements in the DST above.

259

// When we follow for each combination of (m0, n0), every element of the DST matrix "section" is filled.

260

//

261

262

// Get thread coordinates within an mmul block (of size MMUL_BLOCK_SIZE)

263

// Since threads are grouped in x dimension, the modular of x-dim global id

264

// wrt the MMUL_BLOCK_SIZE is the thread id in the group, ranging from 0 to

265

// MMUL_BLOCK_SIZE-1. Because the thread numbering is in row-major order.

SiCong Li

2023-05-19 14:23:37 +0100

[diff] [blame]

266

const uint thread_id = (x0 % MMUL_BLOCK_SIZE);

267

const uint thread_x = thread_id % MMUL_N0;

268

const uint thread_y = (thread_id / MMUL_N0);

269

270

// Starting destination coordinates

271

// Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication

272

// part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results

273

// Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

274

// The unclamped dst coordinates can be calculated easily from the output section coordinates and the thread coordinates (see above figure).

275

276

// See Figure 1 & 2. Thread step size is N0 and M0,

277

// Section step size is N0 x MMUL_N0 and M0 x MMUL_M0

278

// respectively for x, y dimensions.

279

const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;

280

const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;

SiCong Li

2023-05-19 14:23:37 +0100

[diff] [blame]

281

const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));

282

const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));

283

284

// Starting LHS coordinates

285

const uint lhs_x = thread_x;

286

const uint lhs_y = dst_y;

287

288

// Starting RHS coordinates

289

const uint rhs_x = dst_x;

290

const uint rhs_y = thread_y;

291

292

// Compute LHS/RHS/DST matrix address

293

lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;

294

rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;

295

dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

296

297

// Initialize the accumulators

298

// MMUL extension accumulate the result in F32 for both F32 and F16

299

TILE(float, M0, N0, c_f32);

300

301

LOOP_UNROLLING(int, i, 0, 1, M0,

{

c_f32[i].v = 0;

})

for(int k = 0; k < K; k += MMUL_K0)

307

{

308

// A tile of M0xK0 but K0 must be set to 1

309

TILE(DATA_TYPE, M0, 1, a);

310

// A tile of K0xN0 but K0 must be set to 1

311

TILE(DATA_TYPE, 1, N0, b);

312

313

// Load tile from the lhs/rhs tensors

314

T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

315

T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

316

317

LOOP_UNROLLING(int, m0, 0, 1, M0,

318

{

319

LOOP_UNROLLING(int, n0, 0, 1, N0,

320

{

321

c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);

})

})

lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);

326

rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;

327

}

328

329

// For threads "outside" of the dst bound, we do not write but we have to "read" (arm_matrix_multiply). That's why this needs to happen after arm_matrix_multiply

330

if(dst_x_unclamped >= N || dst_y_unclamped >= M)

{

return;

}

#if defined(HALF_PRECISION)

336

TILE(DATA_TYPE, M0, N0, c);

337

338

// Conversion required for the half precision

339

LOOP_UNROLLING(int, m0, 0, 1, M0,

340

{

341

LOOP_UNROLLING(int, n0, 0, 1, N0,

342

{

343

c[m0].s[n0] = c_f32[m0].s[n0];

344

})

345

})

346

#else // defined(HALF_PRECISION)

347

#define c c_f32

348

#endif // defined(HALF_PRECISION)

349

350

if(dst_x + N0 <= N || N0_LEFTOVER == 0)

351

{

352

LOOP_UNROLLING(int, m0, 0, 1, M0,

353

{

354

if(dst_y + m0 < M || M0_LEFTOVER == 0)

355

{

356

VSTORE(N0)

357

(c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));

}

})

}

else

{

LOOP_UNROLLING(int, m0, 0, 1, M0,

364

{

365

if(dst_y + m0 < M || M0_LEFTOVER == 0)

366

{

367

VSTORE_PARTIAL(N0, N0_LEFTOVER)

368

(c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));

}

})

}

#undef MMUL_BLOCK_SIZE

374

}

375

#endif // defined(MAT_MUL_NATIVE_MMUL_NT_NT)

Ramy Elgammal

2023-05-19 14:23:37 +0100

[diff] [blame]

376

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

377

#if defined(MAT_MUL_NATIVE_MMUL_T_NT)

378

/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS transposed, RHS non-transposed - buffer only

379

*

380

* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it

381

* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension

382

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

383

* @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).

384

* @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)

385

* @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).

386

* @note The dimension K must be passed at compile time using -DK (e.g. -DK=4). K must be a multiple of MMUL_K0

387

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_T_NT)

388

* @note Only the following configurations of M0, N0 and K0 are currently supported:

389

* - M0 = 1, 2, 3, 4, 8, 16

390

* - N0 = 1, 2, 3, 4, 8, 16

391

* - K0 = 1

392

* @note Values > 8 for M0 are not expected to be efficient

393

*

394

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16

395

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

396

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

397

* @param[in] lhs_w The width of the lhs tensor

398

* @param[in] lhs_h The height of the lhs tensor

399

* @param[in] lhs_n Number of the matrices (buffers) in the batch

400

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

401

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

402

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

403

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

404

* @param[in] rhs_w The width of the rhs tensor

405

* @param[in] rhs_h The height of the rhs tensor

406

* @param[in] rhs_n Number of the matrices (buffers) in the batch

407

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

408

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

409

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

410

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

411

* @param[in] dst_w The width of the dst tensor

412

* @param[in] dst_h The height of the dst tensor

413

* @param[in] dst_n Number of the matrices (buffers) in the batch

414

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

415

* @param[in] M Number of rows in DST matrix

416

* @param[in] N Number of columns in DST matrix

417

* @param[in] K Number of rows in LHS and RHS matrices, which is multiple of MMUL_K0.

418

*/

419

__kernel void mat_mul_native_mmul_t_nt(

420

TENSOR3D_T(lhs, BUFFER),

421

TENSOR3D_T(rhs, BUFFER),

422

TENSOR3D_T(dst, BUFFER),

const int M,

const int N,

const int K)

{

#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)

428

// For explanations on how this kernel works, please refer to NT/NT kernel. This kernel makes little modifications to it.

429

430

const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)

431

// The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)

432

const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)

433

const uint z = get_global_id(2); // Batch

434

435

// Get section coordinates

436

const uint section_x = (x0 / MMUL_BLOCK_SIZE);

437

const uint section_y = y0;

438

439

// Get thread coordinates

440

uint thread_id = (x0 % MMUL_BLOCK_SIZE);

441

uint thread_x = thread_id % MMUL_N0;

442

uint thread_y = (thread_id / MMUL_N0);

443

444

// See Nt/Nt kernel for explanations

445

const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;

446

const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;

447

const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));

448

const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));

449

450

// Starting LHS coordinates

451

uint lhs_x = dst_y;

452

uint lhs_y = thread_x;

453

454

// Starting RHS coordinates

455

uint rhs_x = dst_x;

456

uint rhs_y = thread_y;

457

458

// Compute LHS/RHS/DST matrix address

459

lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;

460

rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;

461

dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

462

463

// Initialize the accumulators

464

// MMUL extension accumulate the result in F32 for both F32 and F16

465

TILE(float, M0, N0, c_f32);

466

467

LOOP_UNROLLING(int, i, 0, 1, M0,

{

c_f32[i].v = 0;

})

for(int k = 0; k < K; k += MMUL_K0)

473

{

474

TILE(DATA_TYPE, 1, M0, a);

475

TILE(DATA_TYPE, 1, N0, b);

476

477

// Load tile from the lhs/rhs tensors

478

T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

479

T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

480

481

LOOP_UNROLLING(int, m0, 0, 1, M0,

482

{

483

LOOP_UNROLLING(int, n0, 0, 1, N0,

484

{

485

c_f32[m0].s[n0] = arm_matrix_multiply(a[0].s[m0], b[0].s[n0], c_f32[m0].s[n0]);

})

})

lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;

490

rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;

491

}

492

493

// For threads "outside" of the dst bound, we do not write but we have to "read" (arm_matrix_multiply). That's why this needs to happen after arm_matrix_multiply

494

if(dst_x_unclamped >= N || dst_y_unclamped >= M)

{

return;

}

#if defined(HALF_PRECISION)

500

TILE(DATA_TYPE, M0, N0, c);

501

502

// Conversion required for the half precision

503

LOOP_UNROLLING(int, m0, 0, 1, M0,

504

{

505

LOOP_UNROLLING(int, n0, 0, 1, N0,

506

{

507

c[m0].s[n0] = c_f32[m0].s[n0];

508

})

509

})

510

#else // defined(HALF_PRECISION)

511

#define c c_f32

512

#endif // defined(HALF_PRECISION)

513

514

if(dst_x + N0 <= N || N0_LEFTOVER == 0)

515

{

516

LOOP_UNROLLING(int, m0, 0, 1, M0,

517

{

518

if(dst_y + m0 < M || M0_LEFTOVER == 0)

519

{

520

VSTORE(N0)

521

(c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));

}

})

}

else

{

LOOP_UNROLLING(int, m0, 0, 1, M0,

528

{

529

if(dst_y + m0 < M || M0_LEFTOVER == 0)

530

{

531

VSTORE_PARTIAL(N0, N0_LEFTOVER)

532

(c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));

}

})

}

#undef MMUL_BLOCK_SIZE

538

}

539

#endif // defined(MAT_MUL_NATIVE_MMUL_T_NT)

540

Ramy Elgammal

2023-05-19 14:23:37 +0100

[diff] [blame]

541

#if defined(MAT_MUL_NATIVE_MMUL_NT_T)

542

/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS transposed - buffer only

543

*

544

* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it

545

* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension

546

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

547

* @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).

548

* @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)

549

* @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).

550

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_T)

551

* @note Only the following configurations of M0, N0 and K0 are currently supported:

552

* - M0 > 0

553

* - N0 = 1, 2, 3, 4, 8, 16

554

* - K0 = 1

555

* @note Values > 8 for M0 are not expected to be efficient

556

*

557

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16

558

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

559

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

560

* @param[in] lhs_w The width of the lhs tensor

561

* @param[in] lhs_h The height of the lhs tensor

562

* @param[in] lhs_n Number of the matrices (buffers) in the batch

563

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

564

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

565

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

566

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

567

* @param[in] rhs_w The width of the rhs tensor

568

* @param[in] rhs_h The height of the rhs tensor

569

* @param[in] rhs_n Number of the matrices (buffers) in the batch

570

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

571

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

572

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

573

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

574

* @param[in] dst_w The width of the dst tensor

575

* @param[in] dst_h The height of the dst tensor

576

* @param[in] dst_n Number of the matrices (buffers) in the batch

577

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

578

* @param[in] M Number of rows in LHS matrix

579

* @param[in] N Number of columns in RHS matrix

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

580

* @param[in] K Number of columns in LHS matrix and columns in RHS matrix, which is multiple of MMUL_K0.

Ramy Elgammal

2023-05-19 14:23:37 +0100

[diff] [blame]

581

*/

582

__kernel void mat_mul_native_mmul_nt_t(

583

TENSOR3D_T(lhs, BUFFER),

584

TENSOR3D_T(rhs, BUFFER),

585

TENSOR3D_T(dst, BUFFER),

const int M,

const int N,

const int K)

{

#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

591

// For explanations on how this kernel works, please refer to NT/NT kernel. This kernel makes little modifications to it.

Ramy Elgammal

2023-05-19 14:23:37 +0100

[diff] [blame]

592

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

593

const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)

594

// The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)

595

const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)

Ramy Elgammal

2023-05-19 14:23:37 +0100

[diff] [blame]

596

const uint z = get_global_id(2); // Batch

597

598

// Get block coordinates

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

599

const uint section_x = (x0 / MMUL_BLOCK_SIZE);

600

const uint section_y = y0;

Ramy Elgammal

2023-05-19 14:23:37 +0100

[diff] [blame]

601

602

// Get thread coordinates within a block

603

const uint thread_id = (x0 % MMUL_BLOCK_SIZE);

604

const uint thread_x = thread_id % MMUL_N0;

605

const uint thread_y = (thread_id / MMUL_N0);

606

607

// Starting destination coordinates

608

// Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication

609

// part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results

610

// Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.

Gunes Bayir

2023-06-19 21:33:51 +0100

[diff] [blame^]

611

const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;

612

const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;

Ramy Elgammal

2023-05-19 14:23:37 +0100

[diff] [blame]

613

const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));

614

const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));

615

616

// Starting LHS coordinates

617

const uint lhs_x = thread_x;

618

const uint lhs_y = dst_y;

619

620

// Starting RHS coordinates

621

const uint rhs_x = thread_y;

622

const uint rhs_y = dst_x;

623

624

// Compute LHS/RHS/DST matrix address

625

lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;

626

rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;

627

dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

628

629

// Initialize the accumulators

630

// MMUL extension accumulate the result in F32 for both F32 and F16

631

TILE(float, M0, N0, c_f32);

632

633

LOOP_UNROLLING(int, i, 0, 1, M0,

{

c_f32[i].v = 0;

})

for(int k = 0; k < K; k += MMUL_K0)

639

{

640

// A tile of M0xK0 but K0 must be set to 1

641

TILE(DATA_TYPE, M0, 1, a);

642

// A tile of N0xK0 but K0 must be set to 1

643

TILE(DATA_TYPE, N0, 1, b);

644

645

// Load tile from the lhs/rhs tensors

646

T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

647

T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

648

649

LOOP_UNROLLING(int, m0, 0, 1, M0,

650

{

651

LOOP_UNROLLING(int, n0, 0, 1, N0,

652

{

653

c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[n0].s[0], c_f32[m0].s[n0]);

})

})

lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);

658

rhs_offset_first_element_in_bytes += MMUL_N0 * sizeof(DATA_TYPE);

659

}

660

661

// For threads "outside" of the dst bound, we do not write but we have to "read" (arm_matrix_multiply). That's why this needs to happen after arm_matrix_multiply

662

if(dst_x_unclamped >= N || dst_y_unclamped >= M)

{

return;

}

#if defined(HALF_PRECISION)

668

TILE(DATA_TYPE, M0, N0, c);

669

670

// Conversion required for the half precision

671

LOOP_UNROLLING(int, m0, 0, 1, M0,

672

{

673

LOOP_UNROLLING(int, n0, 0, 1, N0,

674

{

675

c[m0].s[n0] = c_f32[m0].s[n0];

676

})

677

})

678

#else // defined(HALF_PRECISION)

679

#define c c_f32

680

#endif // defined(HALF_PRECISION)

681

682

if(dst_x + N0 <= N || N0_LEFTOVER == 0)

683

{

684

LOOP_UNROLLING(int, m0, 0, 1, M0,

685

{

686

if(dst_y + m0 < M || M0_LEFTOVER == 0)

687

{

688

VSTORE(N0)

689

(c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));

}

})

}

else

{

LOOP_UNROLLING(int, m0, 0, 1, M0,

696

{

697

if(dst_y + m0 < M || M0_LEFTOVER == 0)

698

{

699

VSTORE_PARTIAL(N0, N0_LEFTOVER)

700

(c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));

}

})

}

#undef MMUL_BLOCK_SIZE

706

}

707

#endif // defined(MAT_MUL_NATIVE_MMUL_NT_T)

Gunes Bayir