Blame - src/core/CL/cl_kernels/common/mat_mul_mmul.cl - ml/ComputeLibrary

* @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).

34

* @note The number of leftover output rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)

35

* @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).

SiCong Li

a8d8058

2023-05-19 14:23:37 +0100

[diff] [blame]

36

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_NT)

37

* @note Only the following configurations of M0, N0 and K0 are currently supported:

38

* - M0 > 0

39

* - N0 = 1, 2, 3, 4, 8, 16

40

* - K0 = 1

41

* @note Values > 8 for M0 are not expected to be efficient

42

*

43

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16

44

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

45

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

46

* @param[in] lhs_w The width of the lhs tensor

47

* @param[in] lhs_h The height of the lhs tensor

48

* @param[in] lhs_n Number of the matrices (buffers) in the batch

49

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

50

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

51

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

52

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

53

* @param[in] rhs_w The width of the rhs tensor

54

* @param[in] rhs_h The height of the rhs tensor

55

* @param[in] rhs_n Number of the matrices (buffers) in the batch

56

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

57

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

58

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

59

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

60

* @param[in] dst_w The width of the dst tensor

61

* @param[in] dst_h The height of the dst tensor

62

* @param[in] dst_n Number of the matrices (buffers) in the batch

63

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

64

* @param[in] M Number of rows in LHS matrix

65

* @param[in] N Number of columns in RHS matrix

Ramy Elgammal

c952596

2023-05-19 14:23:37 +0100

[diff] [blame^]

66

* @param[in] K Number of columns in LHS matrix and rows in RHS matrix, both not transposed. Must be a multiple of MMUL_K0.

SiCong Li

a8d8058

2023-05-19 14:23:37 +0100

[diff] [blame]

67

*/

68

__kernel void mat_mul_native_mmul_nt_nt(

69

TENSOR3D_T(lhs, BUFFER),

70

TENSOR3D_T(rhs, BUFFER),

71

TENSOR3D_T(dst, BUFFER),

72

const int M,

Ramy Elgammal

c952596

2023-05-19 14:23:37 +0100

[diff] [blame^]

73

const int N,

74

const int K)

SiCong Li

a8d8058

2023-05-19 14:23:37 +0100

[diff] [blame]

75

{

76

#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)

77

78

const uint x0 = get_global_id(0); // (N / N0) * MMUL_M0

79

const uint y0 = get_global_id(1); // (M / M0) / MMUL_M0

80

const uint z = get_global_id(2); // Batch

81

82

// Get block coordinates

83

const uint block_x = (x0 / MMUL_BLOCK_SIZE);

84

const uint block_y = y0;

85

86

// Get thread coordinates within a block

87

const uint thread_id = (x0 % MMUL_BLOCK_SIZE);

88

const uint thread_x = thread_id % MMUL_N0;

89

const uint thread_y = (thread_id / MMUL_N0);

90

91

// Starting destination coordinates

92

// Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication

93

// part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results

94

// Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.

95

const uint dst_x_unclamped = thread_x * N0 + block_x * N0 * MMUL_N0;

96

const uint dst_y_unclamped = thread_y * M0 + block_y * M0 * MMUL_M0;

97

const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));

98

const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));

99

100

// Starting LHS coordinates

101

const uint lhs_x = thread_x;

102

const uint lhs_y = dst_y;

103

104

// Starting RHS coordinates

105

const uint rhs_x = dst_x;

106

const uint rhs_y = thread_y;

107

108

// Compute LHS/RHS/DST matrix address

109

lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;

110

rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;

111

dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

112

113

// Initialize the accumulators

114

// MMUL extension accumulate the result in F32 for both F32 and F16

115

TILE(float, M0, N0, c_f32);

116

117

LOOP_UNROLLING(int, i, 0, 1, M0,

{

c_f32[i].v = 0;

})

for(int k = 0; k < K; k += MMUL_K0)

123

{

124

// A tile of M0xK0 but K0 must be set to 1

125

TILE(DATA_TYPE, M0, 1, a);

126

// A tile of K0xN0 but K0 must be set to 1

127

TILE(DATA_TYPE, 1, N0, b);

128

129

// Load tile from the lhs/rhs tensors

130

T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

131

T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

132

133

LOOP_UNROLLING(int, m0, 0, 1, M0,

134

{

135

LOOP_UNROLLING(int, n0, 0, 1, N0,

136

{

137

c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);

})

})

lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);

142

rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;

143

}

144

145

// For threads "outside" of the dst bound, we do not write but we have to "read" (arm_matrix_multiply). That's why this needs to happen after arm_matrix_multiply

146

if(dst_x_unclamped >= N || dst_y_unclamped >= M)

{

return;

}

#if defined(HALF_PRECISION)

152

TILE(DATA_TYPE, M0, N0, c);

153

154

// Conversion required for the half precision

155

LOOP_UNROLLING(int, m0, 0, 1, M0,

156

{

157

LOOP_UNROLLING(int, n0, 0, 1, N0,

158

{

159

c[m0].s[n0] = c_f32[m0].s[n0];

160

})

161

})

162

#else // defined(HALF_PRECISION)

163

#define c c_f32

164

#endif // defined(HALF_PRECISION)

165

166

if(dst_x + N0 <= N || N0_LEFTOVER == 0)

167

{

168

LOOP_UNROLLING(int, m0, 0, 1, M0,

169

{

170

if(dst_y + m0 < M || M0_LEFTOVER == 0)

171

{

172

VSTORE(N0)

173

(c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));

}

})

}

else

{

LOOP_UNROLLING(int, m0, 0, 1, M0,

180

{

181

if(dst_y + m0 < M || M0_LEFTOVER == 0)

182

{

183

VSTORE_PARTIAL(N0, N0_LEFTOVER)

184

(c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));

}

})

}

#undef MMUL_BLOCK_SIZE

190

}

191

#endif // defined(MAT_MUL_NATIVE_MMUL_NT_NT)

Ramy Elgammal

c952596

2023-05-19 14:23:37 +0100

[diff] [blame^]

192

193

#if defined(MAT_MUL_NATIVE_MMUL_NT_T)

194

/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS transposed - buffer only

195

*

196

* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it

197

* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension

198

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

199

* @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).

200

* @note The number of leftover output rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)

201

* @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).

202

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_T)

203

* @note Only the following configurations of M0, N0 and K0 are currently supported:

204

* - M0 > 0

205

* - N0 = 1, 2, 3, 4, 8, 16

206

* - K0 = 1

207

* @note Values > 8 for M0 are not expected to be efficient

208

*

209

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: F32/F16

210

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

211

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

212

* @param[in] lhs_w The width of the lhs tensor

213

* @param[in] lhs_h The height of the lhs tensor

214

* @param[in] lhs_n Number of the matrices (buffers) in the batch

215

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

216

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

217

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

218

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

219

* @param[in] rhs_w The width of the rhs tensor

220

* @param[in] rhs_h The height of the rhs tensor

221

* @param[in] rhs_n Number of the matrices (buffers) in the batch

222

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

223

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

224

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

225

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

226

* @param[in] dst_w The width of the dst tensor

227

* @param[in] dst_h The height of the dst tensor

228

* @param[in] dst_n Number of the matrices (buffers) in the batch

229

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

230

* @param[in] M Number of rows in LHS matrix

231

* @param[in] N Number of columns in RHS matrix

232

* @param[in] K Number of columns in LHS matrix and columns in RHS-Transposed matrix, which must be a multiple of MMUL_K0.

233

*/

234

__kernel void mat_mul_native_mmul_nt_t(

235

TENSOR3D_T(lhs, BUFFER),

236

TENSOR3D_T(rhs, BUFFER),

237

TENSOR3D_T(dst, BUFFER),

const int M,

const int N,

const int K)

{

#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)

243

244

const uint x0 = get_global_id(0); // (N / N0) * MMUL_M0

245

const uint y0 = get_global_id(1); // (M / M0) / MMUL_M0

246

const uint z = get_global_id(2); // Batch

247

248

// Get block coordinates

249

const uint block_x = (x0 / MMUL_BLOCK_SIZE);

250

const uint block_y = y0;

251

252

// Get thread coordinates within a block

253

const uint thread_id = (x0 % MMUL_BLOCK_SIZE);

254

const uint thread_x = thread_id % MMUL_N0;

255

const uint thread_y = (thread_id / MMUL_N0);

256

257

// Starting destination coordinates

258

// Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication

259

// part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results

260

// Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.

261

const uint dst_x_unclamped = thread_x * N0 + block_x * N0 * MMUL_N0;

262

const uint dst_y_unclamped = thread_y * M0 + block_y * M0 * MMUL_M0;

263

const uint dst_x = min(dst_x_unclamped, (uint)(N - N0));

264

const uint dst_y = min(dst_y_unclamped, (uint)(M - M0));

265

266

// Starting LHS coordinates

267

const uint lhs_x = thread_x;

268

const uint lhs_y = dst_y;

269

270

// Starting RHS coordinates

271

const uint rhs_x = thread_y;

272

const uint rhs_y = dst_x;

273

274

// Compute LHS/RHS/DST matrix address

275

lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;

276

rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;

277

dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;

278

279

// Initialize the accumulators

280

// MMUL extension accumulate the result in F32 for both F32 and F16

281

TILE(float, M0, N0, c_f32);

282

283

LOOP_UNROLLING(int, i, 0, 1, M0,

{

c_f32[i].v = 0;

})

for(int k = 0; k < K; k += MMUL_K0)

289

{

290

// A tile of M0xK0 but K0 must be set to 1

291

TILE(DATA_TYPE, M0, 1, a);

292

// A tile of N0xK0 but K0 must be set to 1

293

TILE(DATA_TYPE, N0, 1, b);

294

295

// Load tile from the lhs/rhs tensors

296

T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

297

T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

298

299

LOOP_UNROLLING(int, m0, 0, 1, M0,

300

{

301

LOOP_UNROLLING(int, n0, 0, 1, N0,

302

{

303

c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[n0].s[0], c_f32[m0].s[n0]);

})

})

lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);

308

rhs_offset_first_element_in_bytes += MMUL_N0 * sizeof(DATA_TYPE);

309

}

310

311

// For threads "outside" of the dst bound, we do not write but we have to "read" (arm_matrix_multiply). That's why this needs to happen after arm_matrix_multiply

312

if(dst_x_unclamped >= N || dst_y_unclamped >= M)

{

return;

}

#if defined(HALF_PRECISION)

318