blob: 90ebf80a6a92669911d6b417870d258306e1d26e [file] [log] [blame]
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +00001/*
2 * Copyright (c) 2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25#include "tile_helpers.h"
26
27#if defined(MAT_MUL_NATIVE_NT_NT)
28/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
29 *
30 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
31 * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
Gunes Bayir8918b232023-03-17 13:52:21 +000032 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000033 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
Gunes Bayir8918b232023-03-17 13:52:21 +000034 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000035 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
Gunes Bayirbbeef722023-03-20 10:19:10 +000036 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
Gunes Bayir8918b232023-03-17 13:52:21 +000037 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_NT)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000038 * @note Only the following configurations of M0, N0 and K0 are currently supported:
39 * - M0 > 0
Gunes Bayirbbeef722023-03-20 10:19:10 +000040 * - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000041 * - K0 = 1, 2, 3, 4, 8, 16
42 * @note Values > 8 for M0 are not expected to be efficient
43 *
44 * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
45 * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
46 * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
47 * @param[in] lhs_w The width of the lhs tensor
48 * @param[in] lhs_h The height of the lhs tensor
49 * @param[in] lhs_n Number of the matrices (buffers) in the batch
50 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
Gunes Bayirbbeef722023-03-20 10:19:10 +000051 * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
Gunes Bayir8918b232023-03-17 13:52:21 +000052 * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000053 * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
54 * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
55 * @param[in] rhs_w The width of the rhs tensor
56 * @param[in] rhs_h The height of the rhs tensor
57 * @param[in] rhs_n Number of the matrices (buffers) in the batch
58 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
Gunes Bayir8918b232023-03-17 13:52:21 +000059 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000060 * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
61 * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
62 * @param[in] dst_w The width of the dst tensor
63 * @param[in] dst_h The height of the dst tensor
64 * @param[in] dst_n Number of the matrices (buffers) in the batch
65 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
66 */
67__kernel void mat_mul_native_nt_nt(
68 TENSOR3D_T(lhs, BUFFER),
Gunes Bayirbbeef722023-03-20 10:19:10 +000069 TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000070 TENSOR3D_T(dst, BUFFER))
71{
72 const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
73 const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
74 const uint z = GET_SPATIAL_IDX(2, 1, 0);
75
76 // Compute LHS/RHS/DST matrix address
77 lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000078 dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
79
80 // Initialize the accumulators
81 TILE(DATA_TYPE, M0, N0, acc);
82
83 LOOP_UNROLLING(int, i, 0, 1, M0,
84 {
85 acc[i].v = 0.f;
86 })
87
Gunes Bayirbbeef722023-03-20 10:19:10 +000088 const int rhs_z = z * rhs_h;
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +000089 int k;
90 for(k = 0; k <= K - K0; k += K0)
91 {
92 TILE(DATA_TYPE, M0, K0, a);
93 TILE(DATA_TYPE, K0, N0, b);
94
95 LOOP_UNROLLING(int, i, 0, 1, M0,
96 {
97 a[i].v = 0.f;
98 })
99
100 LOOP_UNROLLING(int, i, 0, 1, K0,
101 {
102 b[i].v = 0.f;
103 })
104
105 // Load tile from the lhs/rhs tensors
106 T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Gunes Bayirbbeef722023-03-20 10:19:10 +0000107 T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000108
109 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, acc);
110
111 lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000112 }
113
114#ifdef K % K0 != 0
Gunes Bayir8918b232023-03-17 13:52:21 +0000115 /* Leftover Loop */
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000116 for(; k < K; ++k)
117 {
118 TILE(DATA_TYPE, M0, 1, a);
119 TILE(DATA_TYPE, 1, N0, b);
120
121 LOOP_UNROLLING(int, i, 0, 1, M0,
122 {
123 a[i].v = 0.f;
124 })
125
126 LOOP_UNROLLING(int, i, 0, 1, 1,
127 {
128 b[i].v = 0.f;
129 })
130
131 // Load tile from the lhs/rhs tensors
132 T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Gunes Bayirbbeef722023-03-20 10:19:10 +0000133 T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000134
135 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, acc);
136
137 lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000138 }
139#endif // K % K0 != 0
140
141 const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
142 const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
143
144 TILE(int, M0, 1, indirect_buffer);
145 LOOP_UNROLLING(int, _i, 0, 1, M0,
146 {
147 indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
148 });
149
150 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
151}
152#endif // defined(MAT_MUL_NATIVE_NT_NT)
153
154#if defined(MAT_MUL_NATIVE_NT_T)
155/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
156 *
157 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
158 * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
Gunes Bayir8918b232023-03-17 13:52:21 +0000159 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000160 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
Gunes Bayir8918b232023-03-17 13:52:21 +0000161 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000162 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
Gunes Bayir8918b232023-03-17 13:52:21 +0000163 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_T)
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000164 * @note Only the following configurations of M0, N0 and K0 are currently supported:
165 * - M0 > 0
166 * - N0 = 1, 2, 3, 4, 8, 16
167 * - K0 = 1, 2, 3, 4, 8, 16
168 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
169 *
170 * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
171 * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
172 * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
173 * @param[in] lhs_w The width of the lhs tensor
174 * @param[in] lhs_h The height of the lhs tensor
175 * @param[in] lhs_n Number of the matrices (buffers) in the batch
176 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
Gunes Bayir8918b232023-03-17 13:52:21 +0000177 * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000178 * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
179 * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
180 * @param[in] rhs_w The width of the rhs tensor
181 * @param[in] rhs_h The height of the rhs tensor
182 * @param[in] rhs_n Number of the matrices (buffers) in the batch
183 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
Gunes Bayir8918b232023-03-17 13:52:21 +0000184 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
Ramy Elgammal2b6ebfe2023-03-09 21:15:37 +0000185 * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
186 * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
187 * @param[in] dst_w The width of the dst tensor
188 * @param[in] dst_h The height of the dst tensor
189 * @param[in] dst_n Number of the matrices (buffers) in the batch
190 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
191 */
__kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
                                  TENSOR3D_T(rhs, BUFFER),
                                  TENSOR3D_T(dst, BUFFER))

{
    // Coordinates of the output tile computed by this work-item.
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    // RHS is transposed: the n coordinate (x) advances along rhs rows.
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(DATA_TYPE, M0, N0, acc);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = 0.f;
    })

    // Main loop: consumes K0 accumulations per iteration.
    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        // b is loaded in its transposed (N0 x K0) layout.
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // This part is written to decrease the number of loop unrollings caused
        // by T_MMUL. The NT/NT version is partly vectorized and uses less number
        // of loop unrollings, and code behaves as expected. Although this is not
        // a performant solution for the specified architecture, it is necessary
        // to overcome some limitations.
        // Transpose b in registers (bt = b^T) so the NT/NT T_MMUL variant can be used.
        TILE(DATA_TYPE, K0, N0, bt);
        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                bt[j].s[i] = b[i].s[j];
            })
        })
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, bt, acc);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        // Both operands advance K0 contiguous elements along their K dimension.
        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if K % K0 != 0
    /* Leftover Loop */
    // Processes the remaining K % K0 accumulations one element at a time.
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // See the main loop for the explanation of this part
        TILE(DATA_TYPE, 1, N0, bt);
        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            bt[0].s[i] = b[i].s[0];
        })
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, bt, acc);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // K % K0 != 0

    // Partial-store handling: only the work-item with global id 0 in a given
    // dimension owns the leftover rows/columns (when PARTIAL_STORE_* != 0).
    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    // Clamp destination row indices so out-of-range rows are redirected to the
    // last valid row of the (possibly partial) block.
    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
Gunes Bayir8918b232023-03-17 13:52:21 +0000306#endif // defined(MAT_MUL_NATIVE_NT_T)
307
308#if defined(MAT_MUL_NATIVE_T_NT)
309/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only
310 *
311 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
312 * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
313 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
314 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
315 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
316 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
Gunes Bayirbbeef722023-03-20 10:19:10 +0000317 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
Gunes Bayir8918b232023-03-17 13:52:21 +0000318 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)
319 * @note Only the following configurations of M0, N0 and K0 are currently supported:
320 * - M0 = 1, 2, 3, 4, 8, 16
Gunes Bayirbbeef722023-03-20 10:19:10 +0000321 * - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
Gunes Bayir8918b232023-03-17 13:52:21 +0000322 * - K0 > 0
 * @note Values > 8 for M0 and K0 are not expected to be efficient
324 *
325 * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
326 * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
327 * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
328 * @param[in] lhs_w The width of the lhs tensor
329 * @param[in] lhs_h The height of the lhs tensor
330 * @param[in] lhs_n Number of the matrices (buffers) in the batch
331 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
Gunes Bayirbbeef722023-03-20 10:19:10 +0000332 * @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
Gunes Bayir8918b232023-03-17 13:52:21 +0000333 * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
334 * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
335 * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
336 * @param[in] rhs_w The width of the rhs tensor
337 * @param[in] rhs_h The height of the rhs tensor
338 * @param[in] rhs_n Number of the matrices (buffers) in the batch
339 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
340 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
341 * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
342 * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
343 * @param[in] dst_w The width of the dst tensor
344 * @param[in] dst_h The height of the dst tensor
345 * @param[in] dst_n Number of the matrices (buffers) in the batch
346 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
347 */
348__kernel void mat_mul_native_t_nt(
349 TENSOR3D_T(lhs, BUFFER),
Gunes Bayirbbeef722023-03-20 10:19:10 +0000350 TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
Gunes Bayir8918b232023-03-17 13:52:21 +0000351 TENSOR3D_T(dst, BUFFER))
352{
353 const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
354 const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
355 const uint z = GET_SPATIAL_IDX(2, 1, 0);
356
357 // Compute LHS/RHS/DST matrix address
358 lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
Gunes Bayir8918b232023-03-17 13:52:21 +0000359 dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
360
361 // Initialize the accumulators
362 TILE(DATA_TYPE, M0, N0, acc);
363
364 LOOP_UNROLLING(int, i, 0, 1, M0,
365 {
366 acc[i].v = 0.f;
367 })
368
Gunes Bayirbbeef722023-03-20 10:19:10 +0000369 const int rhs_z = z * rhs_h;
Gunes Bayir8918b232023-03-17 13:52:21 +0000370 int k;
371 for(k = 0; k <= K - K0; k += K0)
372 {
373 TILE(DATA_TYPE, K0, M0, a);
374 TILE(DATA_TYPE, K0, N0, b);
375
376 LOOP_UNROLLING(int, i, 0, 1, K0,
377 {
378 a[i].v = 0.f;
379 })
380
381 LOOP_UNROLLING(int, i, 0, 1, K0,
382 {
383 b[i].v = 0.f;
384 })
385
386 // Load tile from the lhs/rhs tensors
387 T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Gunes Bayirbbeef722023-03-20 10:19:10 +0000388 T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
Gunes Bayir8918b232023-03-17 13:52:21 +0000389
390#if GPU_ARCH == GPU_ARCH_MIDGARD
391 // For explanation, see mat_mul_native_nt_t
392 TILE(DATA_TYPE, M0, K0, at);
393 LOOP_UNROLLING(int, i, 0, 1, K0,
394 {
395 LOOP_UNROLLING(int, j, 0, 1, M0,
396 {
397 at[j].s[i] = a[i].s[j];
398 })
399 })
400 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, b, acc);
401#else // GPU_ARCH == GPU_ARCH_MIDGARD
402 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, NT, a, b, acc);
403#endif // GPU_ARCH == GPU_ARCH_MIDGARD
404
405 lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
Gunes Bayir8918b232023-03-17 13:52:21 +0000406 }
407
408#ifdef K % K0 != 0
409 /* Leftover Loop */
410 for(; k < K; ++k)
411 {
412 TILE(DATA_TYPE, 1, M0, a);
413 TILE(DATA_TYPE, 1, N0, b);
414
415 LOOP_UNROLLING(int, i, 0, 1, 1,
416 {
417 a[i].v = 0.f;
418 })
419
420 LOOP_UNROLLING(int, i, 0, 1, 1,
421 {
422 b[i].v = 0.f;
423 })
424
425 // Load tile from the lhs/rhs tensors
426 T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
Gunes Bayirbbeef722023-03-20 10:19:10 +0000427 T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
Gunes Bayir8918b232023-03-17 13:52:21 +0000428
429#if GPU_ARCH == GPU_ARCH_MIDGARD
430 // For explanation, see mat_mul_native_nt_t
431 TILE(DATA_TYPE, M0, 1, at);
432 LOOP_UNROLLING(int, j, 0, 1, M0,
433 {
434 at[j].s[0] = a[0].s[j];
435 })
436 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, b, acc);
437#else // GPU_ARCH == GPU_ARCH_MIDGARD
438 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, NT, a, b, acc);
439#endif // GPU_ARCH == GPU_ARCH_MIDGARD
440
441 lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
Gunes Bayir8918b232023-03-17 13:52:21 +0000442 }
443#endif // K % K0 != 0
444
445 const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
446 const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
447
448 TILE(int, M0, 1, indirect_buffer);
449 LOOP_UNROLLING(int, _i, 0, 1, M0,
450 {
451 indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
452 });
453
454 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
455}
456#endif // defined(MAT_MUL_NATIVE_T_NT)
457
458#if defined(MAT_MUL_NATIVE_T_T)
459/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only
460 *
461 * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
462 * should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
463 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
464 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
465 * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
466 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_T)
468 * @note Only the following configurations of M0, N0 and K0 are currently supported:
469 * - M0 = 1, 2, 3, 4, 8, 16
470 * - N0 = 1, 2, 3, 4, 8, 16
471 * - K0 = 1, 2, 3, 4, 8, 16
472 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
473 *
474 * @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16
475 * @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)
476 * @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)
477 * @param[in] lhs_w The width of the lhs tensor
478 * @param[in] lhs_h The height of the lhs tensor
479 * @param[in] lhs_n Number of the matrices (buffers) in the batch
480 * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
481 * @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
482 * @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)
483 * @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)
484 * @param[in] rhs_w The width of the rhs tensor
485 * @param[in] rhs_h The height of the rhs tensor
486 * @param[in] rhs_n Number of the matrices (buffers) in the batch
487 * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
488 * @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
489 * @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)
490 * @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)
491 * @param[in] dst_w The width of the dst tensor
492 * @param[in] dst_h The height of the dst tensor
493 * @param[in] dst_n Number of the matrices (buffers) in the batch
494 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
495 */
496__kernel void mat_mul_native_t_t(
497 TENSOR3D_T(lhs, BUFFER),
498 TENSOR3D_T(rhs, BUFFER),
499 TENSOR3D_T(dst, BUFFER))
500{
501 const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
502 const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
503 const uint z = GET_SPATIAL_IDX(2, 1, 0);
504
505 // Compute LHS/RHS/DST matrix address
506 lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
507 rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
508 dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
509
510 // Initialize the accumulators
511 TILE(DATA_TYPE, M0, N0, acc);
512
513 LOOP_UNROLLING(int, i, 0, 1, M0,
514 {
515 acc[i].v = 0.f;
516 })
517
518 int k;
519 for(k = 0; k <= K - K0; k += K0)
520 {
521 TILE(DATA_TYPE, K0, M0, a);
522 TILE(DATA_TYPE, N0, K0, b);
523
524 LOOP_UNROLLING(int, i, 0, 1, K0,
525 {
526 a[i].v = 0.f;
527 })
528
529 LOOP_UNROLLING(int, i, 0, 1, N0,
530 {
531 b[i].v = 0.f;
532 })
533
534 // Load tile from the lhs/rhs tensors
535 T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
536 T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
537
538#if GPU_ARCH == GPU_ARCH_MIDGARD
539 // For explanation, see mat_mul_native_nt_t
540 TILE(DATA_TYPE, M0, K0, at);
541 TILE(DATA_TYPE, K0, N0, bt);
542
543 LOOP_UNROLLING(int, i, 0, 1, K0,
544 {
545 LOOP_UNROLLING(int, j, 0, 1, M0,
546 {
547 at[j].s[i] = a[i].s[j];
548 })
549 })
550
551 LOOP_UNROLLING(int, i, 0, 1, N0,
552 {
553 LOOP_UNROLLING(int, j, 0, 1, K0,
554 {
555 bt[j].s[i] = b[i].s[j];
556 })
557 })
558
559 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, bt, acc);
560#else // GPU_ARCH == GPU_ARCH_MIDGARD
561 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, T, a, b, acc);
562#endif // GPU_ARCH == GPU_ARCH_MIDGARD
563
564 lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
565 rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
566 }
567
568#ifdef K % K0 != 0
569 /* Leftover Loop */
570 for(; k < K; ++k)
571 {
572 TILE(DATA_TYPE, 1, M0, a);
573 TILE(DATA_TYPE, N0, 1, b);
574
575 LOOP_UNROLLING(int, i, 0, 1, 1,
576 {
577 a[i].v = 0.f;
578 })
579
580 LOOP_UNROLLING(int, i, 0, 1, N0,
581 {
582 b[i].v = 0.f;
583 })
584
585 // Load tile from the lhs/rhs tensors
586 T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
587 T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
588
589#if GPU_ARCH == GPU_ARCH_MIDGARD
590 // For explanation, see mat_mul_native_nt_t
591 TILE(DATA_TYPE, M0, 1, at);
592 TILE(DATA_TYPE, 1, N0, bt);
593
594 LOOP_UNROLLING(int, j, 0, 1, M0,
595 {
596 at[j].s[0] = a[0].s[j];
597 })
598
599 LOOP_UNROLLING(int, i, 0, 1, N0,
600 {
601 bt[0].s[i] = b[i].s[0];
602 })
603
604 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, bt, acc);
605#else // GPU_ARCH == GPU_ARCH_MIDGARD
606 T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, T, a, b, acc);
607#endif // GPU_ARCH == GPU_ARCH_MIDGARD
608
609 lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
610 rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
611 }
612#endif // K % K0 != 0
613
614 const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
615 const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
616
617 TILE(int, M0, 1, indirect_buffer);
618 LOOP_UNROLLING(int, _i, 0, 1, M0,
619 {
620 indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
621 });
622
623 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
624}
625#endif // defined(MAT_MUL_NATIVE_T_T)