Blame - src/core/CL/cl_kernels/common/mat_mul.cl - ml/ComputeLibrary

2023-05-25 16:48:43 +0100

[diff] [blame^]

35

* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

36

* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

37

* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

38

* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

39

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_NT)

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

40

* @note Only the following configurations of M0, N0 and K0 are currently supported:

41

* - M0 > 0

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

42

* - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

43

* - K0 = 1, 2, 3, 4, 8, 16

44

* @note Values > 8 for M0 are not expected to be efficient

45

*

46

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16

47

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

48

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

49

* @param[in] lhs_w The width of the lhs tensor

50

* @param[in] lhs_h The height of the lhs tensor

51

* @param[in] lhs_n Number of the matrices (buffers) in the batch

52

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

53

* @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

54

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

55

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

56

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

57

* @param[in] rhs_w The width of the rhs tensor

58

* @param[in] rhs_h The height of the rhs tensor

59

* @param[in] rhs_n Number of the matrices (buffers) in the batch

60

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

61

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

62

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

63

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

64

* @param[in] dst_w The width of the dst tensor

65

* @param[in] dst_h The height of the dst tensor

66

* @param[in] dst_n Number of the matrices (buffers) in the batch

67

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

68

*/

69

__kernel void mat_mul_native_nt_nt(

70

TENSOR3D_T(lhs, BUFFER),

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

71

TENSOR3D_T(rhs, RHS_TENSOR_TYPE),

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

72

TENSOR3D_T(dst, BUFFER))

73

{

74

const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);

75

const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);

76

const uint z = GET_SPATIAL_IDX(2, 1, 0);

77

78

// Compute LHS/RHS/DST matrix address

79

lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

80

dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

81

82

// Initialize the accumulators

83

TILE(DATA_TYPE, M0, N0, acc);

84

85

LOOP_UNROLLING(int, i, 0, 1, M0,

{

acc[i].v = 0.f;

})

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

90

const int rhs_z = z * rhs_h;

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

91

int k;

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

92

for(k = 0; k <= K - K0; k += K0)

93

{

94

TILE(DATA_TYPE, M0, K0, a);

95

TILE(DATA_TYPE, K0, N0, b);

96

97

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0.f;

})

LOOP_UNROLLING(int, i, 0, 1, K0,

{

b[i].v = 0.f;

})

// Load tile from the lhs/rhs tensors

108

T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

109

T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

110

111

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, acc);

112

113

lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

114

}

115

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

116

#if K % K0 != 0

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

117

/* Leftover Loop */

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

118

for(; k < K; ++k)

119

{

120

TILE(DATA_TYPE, M0, 1, a);

121

TILE(DATA_TYPE, 1, N0, b);

122

123

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0.f;

})

LOOP_UNROLLING(int, i, 0, 1, 1,

{

b[i].v = 0.f;

})

// Load tile from the lhs/rhs tensors

134

T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

135

T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

136

137

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, acc);

138

139

lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

140

}

141

#endif // K % K0 != 0

142

143

const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;

144

const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

145

146

TILE(int, M0, 1, indirect_buffer);

147

LOOP_UNROLLING(int, _i, 0, 1, M0,

148

{

149

indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));

150

});

151

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

152

T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

153

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

154

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);

155

}

156

#endif // defined(MAT_MUL_NATIVE_NT_NT)

157

158

#if defined(MAT_MUL_NATIVE_NT_T)

159

/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only

160

*

161

* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it

162

* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

163

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

164

* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

165

* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

166

* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

167

* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

168

* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

169

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_T)

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

170

* @note Only the following configurations of M0, N0 and K0 are currently supported:

171

* - M0 > 0

172

* - N0 = 1, 2, 3, 4, 8, 16

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

173

* - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

174

* @note Values > 8 for M0, N0 and K0 are not expected to be efficient

175

*

176

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16

177

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

178

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

179

* @param[in] lhs_w The width of the lhs tensor

180

* @param[in] lhs_h The height of the lhs tensor

181

* @param[in] lhs_n Number of the matrices (buffers) in the batch

182

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

183

* @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

184

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

185

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

186

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

187

* @param[in] rhs_w The width of the rhs tensor

188

* @param[in] rhs_h The height of the rhs tensor

189

* @param[in] rhs_n Number of the matrices (buffers) in the batch

190

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

191

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

192

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

193

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

194

* @param[in] dst_w The width of the dst tensor

195

* @param[in] dst_h The height of the dst tensor

196

* @param[in] dst_n Number of the matrices (buffers) in the batch

197

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

198

*/

199

__kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

200

TENSOR3D_T(rhs, RHS_TENSOR_TYPE),

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

201

TENSOR3D_T(dst, BUFFER))

202

203

{

204

const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);

205

const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);

206

const uint z = GET_SPATIAL_IDX(2, 1, 0);

207

208

// Compute LHS/RHS/DST matrix address

209

lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

210

dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

211

212

// Initialize the accumulators

213

TILE(DATA_TYPE, M0, N0, acc);

214

215

LOOP_UNROLLING(int, i, 0, 1, M0,

{

acc[i].v = 0.f;

})

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

220

const int rhs_z = z * rhs_h;

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

221

int k;

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

222

for(k = 0; k <= K - K0; k += K0)

223

{

224

TILE(DATA_TYPE, M0, K0, a);

225

TILE(DATA_TYPE, N0, K0, b);

226

227

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0.f;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0.f;

})

// Load tile from the lhs/rhs tensors

238

T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

239

T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

240

241

#if GPU_ARCH == GPU_ARCH_MIDGARD

242

// This part is written to decrease the number of loop unrollings caused

243

// by T_MMUL. The NT/NT version is partly vectorized and uses less number

244

// of loop unrollings, and code behaves as expected. Although this is not

245

// a performant solution for the specified architecture, it is necessary

246

// to overcome some limitations.

247

TILE(DATA_TYPE, K0, N0, bt);

248

LOOP_UNROLLING(int, i, 0, 1, N0,

249

{

250

LOOP_UNROLLING(int, j, 0, 1, K0,

251

{

252

bt[j].s[i] = b[i].s[j];

253

})

254

})

255

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, bt, acc);

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

256

#else // GPU_ARCH == GPU_ARCH_MIDGARD

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

257

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, acc);

258

#endif // GPU_ARCH == GPU_ARCH_MIDGARD

259

260

lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

}

#if K % K0 != 0

/* Leftover Loop */

for(; k < K; ++k)

{

TILE(DATA_TYPE, M0, 1, a);

268

TILE(DATA_TYPE, N0, 1, b);

269

270

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0.f;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0.f;

})

// Load tile from the lhs/rhs tensors

281

T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

282

T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

283

284

#if GPU_ARCH == GPU_ARCH_MIDGARD

285

// See the main loop for the explanation of this part

286

TILE(DATA_TYPE, 1, N0, bt);

287

LOOP_UNROLLING(int, i, 0, 1, N0,

288

{

289

bt[0].s[i] = b[i].s[0];

290

})

291

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, bt, acc);

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

292

#else // GPU_ARCH == GPU_ARCH_MIDGARD

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

293

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, T, a, b, acc);

294

#endif // GPU_ARCH == GPU_ARCH_MIDGARD

295

296

lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

297

}

298

#endif // K % K0 != 0

299

300

const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;

301

const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

302

303

TILE(int, M0, 1, indirect_buffer);

304

LOOP_UNROLLING(int, _i, 0, 1, M0,

305

{

306

indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));

307

});

308

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

309

T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

310

Ramy Elgammal

2023-03-09 21:15:37 +0000

[diff] [blame]

311

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);

312

}

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

313

#endif // defined(MAT_MUL_NATIVE_NT_T)

314

315

#if defined(MAT_MUL_NATIVE_T_NT)

316

/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only

317

*

318

* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it

319

* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension

320

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

321

* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

322

* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

323

* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)

324

* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

325

* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

326

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)

327

* @note Only the following configurations of M0, N0 and K0 are currently supported:

328

* - M0 = 1, 2, 3, 4, 8, 16

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

329

* - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

330

* - K0 > 0

331

* * @note Values > 8 for M0, and K0 are not expected to be efficient

332

*

333

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16

334

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

335

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

336

* @param[in] lhs_w The width of the lhs tensor

337

* @param[in] lhs_h The height of the lhs tensor

338

* @param[in] lhs_n Number of the matrices (buffers) in the batch

339

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

340

* @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

341

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

342

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

343

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

344

* @param[in] rhs_w The width of the rhs tensor

345

* @param[in] rhs_h The height of the rhs tensor

346

* @param[in] rhs_n Number of the matrices (buffers) in the batch

347

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

348

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

349

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

350

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

351

* @param[in] dst_w The width of the dst tensor

352

* @param[in] dst_h The height of the dst tensor

353

* @param[in] dst_n Number of the matrices (buffers) in the batch

354

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

355

*/

356

__kernel void mat_mul_native_t_nt(

357

TENSOR3D_T(lhs, BUFFER),

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

358

TENSOR3D_T(rhs, RHS_TENSOR_TYPE),

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

359

TENSOR3D_T(dst, BUFFER))

360

{

361

const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);

362

const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);

363

const uint z = GET_SPATIAL_IDX(2, 1, 0);

364

365

// Compute LHS/RHS/DST matrix address

366

lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

367

dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

368

369

// Initialize the accumulators

370

TILE(DATA_TYPE, M0, N0, acc);

371

372

LOOP_UNROLLING(int, i, 0, 1, M0,

{

acc[i].v = 0.f;

})

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

377

const int rhs_z = z * rhs_h;

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

378

int k;

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

379

for(k = 0; k <= K - K0; k += K0)

380

{

381

TILE(DATA_TYPE, K0, M0, a);

382

TILE(DATA_TYPE, K0, N0, b);

383

384

LOOP_UNROLLING(int, i, 0, 1, K0,

{

a[i].v = 0.f;

})

LOOP_UNROLLING(int, i, 0, 1, K0,

{

b[i].v = 0.f;

})

// Load tile from the lhs/rhs tensors

395

T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

396

T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

397

398

#if GPU_ARCH == GPU_ARCH_MIDGARD

399

// For explanation, see mat_mul_native_nt_t

400

TILE(DATA_TYPE, M0, K0, at);

401

LOOP_UNROLLING(int, i, 0, 1, K0,

402

{

403

LOOP_UNROLLING(int, j, 0, 1, M0,

404

{

405

at[j].s[i] = a[i].s[j];

406

})

407

})

408

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, b, acc);

409

#else // GPU_ARCH == GPU_ARCH_MIDGARD

410

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, NT, a, b, acc);

411

#endif // GPU_ARCH == GPU_ARCH_MIDGARD

412

413

lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

414

}

415

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

416

#if K % K0 != 0

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

/* Leftover Loop */

for(; k < K; ++k)

{

TILE(DATA_TYPE, 1, M0, a);

421

TILE(DATA_TYPE, 1, N0, b);

422

423

LOOP_UNROLLING(int, i, 0, 1, 1,

{

a[i].v = 0.f;

})

LOOP_UNROLLING(int, i, 0, 1, 1,

{

b[i].v = 0.f;

})

// Load tile from the lhs/rhs tensors

434

T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

Gunes Bayir

2023-03-20 10:19:10 +0000

[diff] [blame]

435

T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

436

437

#if GPU_ARCH == GPU_ARCH_MIDGARD

438

// For explanation, see mat_mul_native_nt_t

439

TILE(DATA_TYPE, M0, 1, at);

440

LOOP_UNROLLING(int, j, 0, 1, M0,

441

{

442

at[j].s[0] = a[0].s[j];

443

})

444

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, b, acc);

445

#else // GPU_ARCH == GPU_ARCH_MIDGARD

446

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, NT, a, b, acc);

447

#endif // GPU_ARCH == GPU_ARCH_MIDGARD

448

449

lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

450

}

451

#endif // K % K0 != 0

452

453

const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;

454

const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

455

456

TILE(int, M0, 1, indirect_buffer);

457

LOOP_UNROLLING(int, _i, 0, 1, M0,

458

{

459

indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));

460

});

461

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

462

T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

463

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

464

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);

465

}

466

#endif // defined(MAT_MUL_NATIVE_T_NT)

467

468

#if defined(MAT_MUL_NATIVE_T_T)

469

/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only

470

*

471

* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it

472

* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension

473

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

474

* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

475

* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

476

* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)

477

* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

478

* @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

479

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)

480

* @note Only the following configurations of M0, N0 and K0 are currently supported:

481

* - M0 = 1, 2, 3, 4, 8, 16

482

* - N0 = 1, 2, 3, 4, 8, 16

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

483

* - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

484

* @note Values > 8 for M0, N0 and K0 are not expected to be efficient

485

*

486

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16

487

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

488

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

489

* @param[in] lhs_w The width of the lhs tensor

490

* @param[in] lhs_h The height of the lhs tensor

491

* @param[in] lhs_n Number of the matrices (buffers) in the batch

492

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

493

* @param[in] rhs_img (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

494

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

495

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

496

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

497

* @param[in] rhs_w The width of the rhs tensor

498

* @param[in] rhs_h The height of the rhs tensor

499

* @param[in] rhs_n Number of the matrices (buffers) in the batch

500

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

501

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

502

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

503

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

504

* @param[in] dst_w The width of the dst tensor

505

* @param[in] dst_h The height of the dst tensor

506

* @param[in] dst_n Number of the matrices (buffers) in the batch

507

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

508

*/

509

__kernel void mat_mul_native_t_t(

510

TENSOR3D_T(lhs, BUFFER),

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

511

TENSOR3D_T(rhs, RHS_TENSOR_TYPE),

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

512

TENSOR3D_T(dst, BUFFER))

513

{

514

const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);

515

const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);

516

const uint z = GET_SPATIAL_IDX(2, 1, 0);

517

518

// Compute LHS/RHS/DST matrix address

519

lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

520

dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

521

522

// Initialize the accumulators

523

TILE(DATA_TYPE, M0, N0, acc);

524

525

LOOP_UNROLLING(int, i, 0, 1, M0,

{

acc[i].v = 0.f;

})

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

530

const int rhs_z = z * rhs_h;

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

531

int k;

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

532

for(k = 0; k <= K - K0; k += K0)

533

{

534

TILE(DATA_TYPE, K0, M0, a);

535

TILE(DATA_TYPE, N0, K0, b);

536

537

LOOP_UNROLLING(int, i, 0, 1, K0,

{

a[i].v = 0.f;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0.f;

})

// Load tile from the lhs/rhs tensors

548

T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

549

T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

550

#if GPU_ARCH == GPU_ARCH_MIDGARD

551

// For explanation, see mat_mul_native_nt_t

552

TILE(DATA_TYPE, M0, K0, at);

553

TILE(DATA_TYPE, K0, N0, bt);

554

555

LOOP_UNROLLING(int, i, 0, 1, K0,

556

{

557

LOOP_UNROLLING(int, j, 0, 1, M0,

558

{

559

at[j].s[i] = a[i].s[j];

})

})

LOOP_UNROLLING(int, i, 0, 1, N0,

564

{

565

LOOP_UNROLLING(int, j, 0, 1, K0,

566

{

567

bt[j].s[i] = b[i].s[j];

})

})

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, bt, acc);

572

#else // GPU_ARCH == GPU_ARCH_MIDGARD

573

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, T, a, b, acc);

574

#endif // GPU_ARCH == GPU_ARCH_MIDGARD

575

576

lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

577

}

578

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

579

#if K % K0 != 0

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

/* Leftover Loop */

for(; k < K; ++k)

{

TILE(DATA_TYPE, 1, M0, a);

584

TILE(DATA_TYPE, N0, 1, b);

585

586

LOOP_UNROLLING(int, i, 0, 1, 1,

{

a[i].v = 0.f;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0.f;

})

// Load tile from the lhs/rhs tensors

597

T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

Ramy Elgammal

2023-03-20 10:19:10 +0000

[diff] [blame]

598

T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

599

600

#if GPU_ARCH == GPU_ARCH_MIDGARD

601

// For explanation, see mat_mul_native_nt_t

602

TILE(DATA_TYPE, M0, 1, at);

603

TILE(DATA_TYPE, 1, N0, bt);

604

605

LOOP_UNROLLING(int, j, 0, 1, M0,

606

{

607

at[j].s[0] = a[0].s[j];

608

})

609

610

LOOP_UNROLLING(int, i, 0, 1, N0,

611

{

612

bt[0].s[i] = b[i].s[0];

613

})

614

615

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, bt, acc);

616

#else // GPU_ARCH == GPU_ARCH_MIDGARD

617

T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, T, a, b, acc);

618

#endif // GPU_ARCH == GPU_ARCH_MIDGARD

619

620

lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;

Gunes Bayir

2023-03-17 13:52:21 +0000

[diff] [blame]

621

}

622

#endif // K % K0 != 0

623

624

const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;

625

const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

626

627

TILE(int, M0, 1, indirect_buffer);

628

LOOP_UNROLLING(int, _i, 0, 1, M0,

629

{

630

indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));

631

});

632

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame^]

633

T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

634

Gunes Bayir