Blame - src/core/CL/cl_kernels/common/mat_mul_quantized.cl - ml/ComputeLibrary

2023-05-25 16:48:43 +0100

[diff] [blame]

51

* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.

Mohammed Suhail Munshi

2023-06-30 15:43:29 +0100

[diff] [blame]

52

* @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

53

* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)

54

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)

55

* @note Only the following configurations of M0, N0 and K0 are currently supported:

56

* - M0 > 0

57

* - N0 = 1, 2, 3, 4, 8, 16

58

* - K0 = 1, 2, 3, 4, 8, 16

59

* @note Values > 8 for M0 are not expected to be efficient

60

*

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

61

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8

62

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

63

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

64

* @param[in] lhs_w The width of the lhs tensor

65

* @param[in] lhs_h The height of the lhs tensor

66

* @param[in] lhs_n Number of the matrices (buffers) in the batch

67

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

68

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

69

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

70

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

71

* @param[in] rhs_w The width of the rhs tensor

72

* @param[in] rhs_h The height of the rhs tensor

73

* @param[in] rhs_n Number of the matrices (buffers) in the batch

74

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

75

* @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr

76

* @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)

77

* @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)

78

* @param[in] bias_w (Optional) The size of the width dimension of the bias tensor

79

* @param[in] bias_h (Optional) The size of the height dimension of the bias tensor

80

* @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor

81

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor

82

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

83

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

84

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

85

* @param[in] dst_w The width of the dst tensor

86

* @param[in] dst_h The height of the dst tensor

87

* @param[in] dst_n Number of the matrices (buffers) in the batch

88

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

89

*/

90

__kernel void mat_mul_native_quantized_nt_nt(

91

TENSOR3D_T(lhs, BUFFER),

92

TENSOR3D_T(rhs, BUFFER),

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

93

#ifdef BIAS

94

TENSOR3D_T(bias, BUFFER),

95

#endif // defined(BIAS)

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

96

TENSOR3D_T(dst, BUFFER))

97

{

98

const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);

99

const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);

100

const uint z = GET_SPATIAL_IDX(2, 1, 0);

101

102

// Compute LHS/RHS/DST matrix address

103

lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;

104

rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;

105

dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

106

107

// Initialize the accumulators

108

TILE(int, M0, N0, acc);

109

LOOP_UNROLLING(int, i, 0, 1, M0,

110

{

111

acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);

112

})

113

114

TILE(int, 1, N0, b_sum);

115

b_sum[0].v = 0;

116

117

TILE(int, 1, M0, a_sum);

a_sum[0].v = 0;

int k;

for(k = 0; k <= K - K0; k += K0)

122

{

123

TILE(DATA_TYPE, M0, K0, a);

124

TILE(DATA_TYPE, N0, K0, b);

125

126

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0;

})

// Load tile from the lhs tensor

137

T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

138

139

// Load tile from the rhs tensor in a transposed fashion

140

// in order to use T_MMUL_NT_T macro because only this macro

141

// can utilize dot product instruction for Int8/UInt8 by

142

// directly multiplying the rows of Lhs and Rhs tensors.

143

T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

144

145

T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

146

147

LOOP_UNROLLING(int, i, 0, 1, M0,

148

{

149

LOOP_UNROLLING(int, j, 0, 1, K0,

150

{

151

a_sum[0].s[i] += (int)a[i].s[j];

})

})

LOOP_UNROLLING(int, i, 0, 1, K0,

156

{

157

LOOP_UNROLLING(int, j, 0, 1, N0,

158

{

159

b_sum[0].s[j] += (int)b[j].s[i];

})

})

lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);

164

rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;

}

#if((K % K0) != 0)

/* Leftover Loop */

for(; k < K; ++k)

{

TILE(DATA_TYPE, M0, 1, a);

172

TILE(DATA_TYPE, N0, 1, b);

173

174

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0;

})

// Load tile from the lhs tensor

185

T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

186

187

// Load tile from the rhs tensor in a transposed fashion.

188

// See the main loop for more explanation

189

T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

190

191

T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

192

193

LOOP_UNROLLING(int, i, 0, 1, M0,

194

{

195

LOOP_UNROLLING(int, j, 0, 1, 1,

196

{

197

a_sum[0].s[i] += (int)a[i].s[j];

})

})

LOOP_UNROLLING(int, i, 0, 1, 1,

202

{

203

LOOP_UNROLLING(int, j, 0, 1, N0,

204

{

205

b_sum[0].s[j] += (int)b[j].s[i];

})

})

lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);

210

rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;

211

}

212

#endif // ((K % K0) != 0)

213

214

LOOP_UNROLLING(int, i, 0, 1, M0,

215

{

216

LOOP_UNROLLING(int, j, 0, 1, N0,

217

{

Mohammed Suhail Munshi

2023-06-19 14:57:57 +0100

[diff] [blame]

218

acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

})

})

const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;

223

const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

224

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

225

#ifdef BIAS

226

perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);

227

#endif // defined(BIAS)

228

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

229

// Quantize the tile

230

TILE(DATA_TYPE, M0, N0, accq);

231

T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

232

Mohammed Suhail Munshi

2023-06-30 15:43:29 +0100

[diff] [blame]

233

T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

234

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

235

TILE(int, M0, 1, indirect_buffer);

236

LOOP_UNROLLING(int, _i, 0, 1, M0,

237

{

238

indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));

239

});

240

241

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);

242

}

243

#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)

244

Jakub Sujak

2023-04-18 08:33:56 +0100

[diff] [blame]

245

#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)

246

/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only

247

*

248

* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it

249

* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension

250

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)

251

* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).

252

* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame]

253

* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.

Mohammed Suhail Munshi

2023-06-30 15:43:29 +0100

[diff] [blame]

254

* @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT

Jakub Sujak

2023-04-18 08:33:56 +0100

[diff] [blame]

255

* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)

256

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)

257

* @note Only the following configurations of M0, N0 and K0 are currently supported:

258

* - M0 > 0

259

* - N0 = 1, 2, 3, 4, 8, 16

260

* - K0 = 1, 2, 3, 4, 8, 16

261

* @note Values > 8 for M0, N0, K0 are not expected to be efficient

262

*

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

263

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED

264

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

265

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

266

* @param[in] lhs_w The width of the lhs tensor

267

* @param[in] lhs_h The height of the lhs tensor

268

* @param[in] lhs_n Number of the matrices (buffers) in the batch

269

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

270

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

271

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

272

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

273

* @param[in] rhs_w The width of the rhs tensor

274

* @param[in] rhs_h The height of the rhs tensor

275

* @param[in] rhs_n Number of the matrices (buffers) in the batch

276

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

277

* @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr

278

* @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)

279

* @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)

280

* @param[in] bias_w (Optional) The size of the width dimension of the bias tensor

281

* @param[in] bias_h (Optional) The size of the height dimension of the bias tensor

282

* @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor

283

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor

284

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

285

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

286

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

287

* @param[in] dst_w The width of the dst tensor

288

* @param[in] dst_h The height of the dst tensor

289

* @param[in] dst_n Number of the matrices (buffers) in the batch

290

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

Jakub Sujak

2023-04-18 08:33:56 +0100

[diff] [blame]

291

*/

292

__kernel void mat_mul_native_quantized_nt_t(

293

TENSOR3D_T(lhs, BUFFER),

294

TENSOR3D_T(rhs, BUFFER),

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

295

#ifdef BIAS

296

TENSOR3D_T(bias, BUFFER),

297

#endif // defined(BIAS)

Jakub Sujak

2023-04-18 08:33:56 +0100

[diff] [blame]

298

TENSOR3D_T(dst, BUFFER))

299

{

300

const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);

301

const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);

302

const uint z = GET_SPATIAL_IDX(2, 1, 0);

303

304

// Compute LHS/RHS/DST matrix address

305

lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;

306

rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;

307

dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

308

309

// Initialize the accumulators

310

TILE(int, M0, N0, acc);

311

LOOP_UNROLLING(int, i, 0, 1, M0,

312

{

313

acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);

314

})

315

316

TILE(int, 1, M0, a_sum);

317

a_sum[0].v = 0;

318

319

TILE(int, 1, N0, b_sum);

b_sum[0].v = 0;

int k;

for(k = 0; k <= K - K0; k += K0)

324

{

325

TILE(DATA_TYPE, M0, K0, a);

326

TILE(DATA_TYPE, N0, K0, b);

327

328

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0;

})

// Load tile from lhs/rhs tensors

339

T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

340

T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

341

342

T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

343

344

LOOP_UNROLLING(int, i, 0, 1, M0,

345

{

346

LOOP_UNROLLING(int, j, 0, 1, K0,

347

{

348

a_sum[0].s[i] += (int)a[i].s[j];

})

})

LOOP_UNROLLING(int, i, 0, 1, N0,

353

{

354

LOOP_UNROLLING(int, j, 0, 1, K0,

355

{

356

b_sum[0].s[i] += (int)b[i].s[j];

})

})

lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);

361

rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);

362

}

363

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame]

364

#if((K % K0) != 0)

Jakub Sujak

2023-04-18 08:33:56 +0100

[diff] [blame]

// Leftover loop

for(; k < K; ++k)

{

TILE(DATA_TYPE, M0, 1, a);

369

TILE(DATA_TYPE, N0, 1, b);

370

371

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0;

})

// Load tile from lhs/rhs tensors

382

T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

383

T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

384

385

T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

386

387

LOOP_UNROLLING(int, i, 0, 1, M0,

388

{

389

LOOP_UNROLLING(int, j, 0, 1, 1,

390

{

391

a_sum[0].s[i] += (int)a[i].s[j];

})

})

LOOP_UNROLLING(int, i, 0, 1, N0,

396

{

397

LOOP_UNROLLING(int, j, 0, 1, 1,

398

{

399

b_sum[0].s[i] += (int)b[i].s[j];

})

})

lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);

404

rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);

405

}

406

#endif // ((K % K0) != 0)

407

408

LOOP_UNROLLING(int, i, 0, 1, M0,

409

{

410

LOOP_UNROLLING(int, j, 0, 1, N0,

411

{

Mohammed Suhail Munshi

2023-06-19 14:57:57 +0100

[diff] [blame]

412

acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];

Jakub Sujak

2023-04-18 08:33:56 +0100

[diff] [blame]

})

})

const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;

417

const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

418

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

419

#ifdef BIAS

420

perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);

421

#endif // defined(BIAS)

422

Jakub Sujak

2023-04-18 08:33:56 +0100

[diff] [blame]

423

// Quantize the tile

424

TILE(DATA_TYPE, M0, N0, accq);

425

T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

426

Mohammed Suhail Munshi

2023-06-30 15:43:29 +0100

[diff] [blame]

427

T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

428

Jakub Sujak

2023-04-18 08:33:56 +0100

[diff] [blame]

429

TILE(int, M0, 1, indirect_buffer);

430

LOOP_UNROLLING(int, _i, 0, 1, M0,

431

{

432

indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));

433

});

434

435

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);

436

}

437

#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)

438

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

439

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)

440

/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed

441

*

442

* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it

443

* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension

444

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)

445

* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).

446

* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame]

447

* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.

Mohammed Suhail Munshi

2023-06-30 15:43:29 +0100

[diff] [blame]

448

* @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

449

* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)

450

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)

451

* @note Only the following configurations of M0, N0 and K0 are currently supported:

452

* - M0 > 0

453

* - N0 = 1, 2, 3, 4, 8, 16

454

* - K0 = 1, 2, 3, 4, 8, 16

455

* @note Values > 8 for M0, N0 and K0 are not expected to be efficient

456

*

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

457

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED

458

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

459

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

460

* @param[in] lhs_w The width of the lhs tensor

461

* @param[in] lhs_h The height of the lhs tensor

462

* @param[in] lhs_n Number of the matrices (buffers) in the batch

463

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

464

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

465

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

466

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

467

* @param[in] rhs_w The width of the rhs tensor

468

* @param[in] rhs_h The height of the rhs tensor

469

* @param[in] rhs_n Number of the matrices (buffers) in the batch

470

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

471

* @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr

472

* @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)

473

* @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)

474

* @param[in] bias_w (Optional) The size of the width dimension of the bias tensor

475

* @param[in] bias_h (Optional) The size of the height dimension of the bias tensor

476

* @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor

477

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor

478

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

479

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

480

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

481

* @param[in] dst_w The width of the dst tensor

482

* @param[in] dst_h The height of the dst tensor

483

* @param[in] dst_n Number of the matrices (buffers) in the batch

484

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

485

*/

486

__kernel void mat_mul_native_quantized_t_nt(

487

TENSOR3D_T(lhs, BUFFER),

488

TENSOR3D_T(rhs, BUFFER),

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

489

#ifdef BIAS

490

TENSOR3D_T(bias, BUFFER),

491

#endif // defined(BIAS)

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

492

TENSOR3D_T(dst, BUFFER))

493

{

494

const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);

495

const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);

496

const uint z = GET_SPATIAL_IDX(2, 1, 0);

497

498

// Compute LHS/RHS/DST matrix address

499

lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;

500

rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;

501

dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

502

503

// Initialize the accumulators

504

TILE(int, M0, N0, acc);

505

LOOP_UNROLLING(int, i, 0, 1, M0,

506

{

507

acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);

508

})

509

510

TILE(int, 1, N0, b_sum);

511

b_sum[0].v = 0;

512

513

TILE(int, 1, M0, a_sum);

a_sum[0].v = 0;

int k;

for(k = 0; k <= K - K0; k += K0)

518

{

519

TILE(DATA_TYPE, M0, K0, a);

520

TILE(DATA_TYPE, N0, K0, b);

521

522

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0;

})

// Load tile from the lhs/rhs tensors in a transposed fashion

533

// see mat_mul_native_quantized_nt_nt main loop for more explanation

534

T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

535

T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

536

537

T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

538

539

LOOP_UNROLLING(int, i, 0, 1, K0,

540

{

541

LOOP_UNROLLING(int, j, 0, 1, M0,

542

{

543

a_sum[0].s[j] += (int)a[j].s[i];

})

})

LOOP_UNROLLING(int, i, 0, 1, K0,

548

{

549

LOOP_UNROLLING(int, j, 0, 1, N0,

550

{

551

b_sum[0].s[j] += (int)b[j].s[i];

})

})

lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;

556

rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;

}

#if((K % K0) != 0)

/* Leftover Loop */

for(; k < K; ++k)

{

TILE(DATA_TYPE, M0, 1, a);

564

TILE(DATA_TYPE, N0, 1, b);

565

566

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0;

})

// Load tile from the lhs/rhs tensors in a transposed fashion

577

// see mat_mul_native_quantized_nt_nt main loop for more explanation

578

T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

579

T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

580

581

T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

582

583

LOOP_UNROLLING(int, i, 0, 1, 1,

584

{

585

LOOP_UNROLLING(int, j, 0, 1, M0,

586

{

587

a_sum[0].s[j] += (int)a[j].s[i];

})

})

LOOP_UNROLLING(int, i, 0, 1, 1,

592

{

593

LOOP_UNROLLING(int, j, 0, 1, N0,

594

{

595

b_sum[0].s[j] += (int)b[j].s[i];

})

})

lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;

600

rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;

601

}

602

#endif // ((K % K0) != 0)

603

604

LOOP_UNROLLING(int, i, 0, 1, M0,

605

{

606

LOOP_UNROLLING(int, j, 0, 1, N0,

607

{

Mohammed Suhail Munshi

2023-06-19 14:57:57 +0100

[diff] [blame]

608

acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

})

})

const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;

613

const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

614

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

615

#ifdef BIAS

616

perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);

617

#endif // defined(BIAS)

618

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

619

// Quantize the tile

620

TILE(DATA_TYPE, M0, N0, accq);

621

T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

622

Mohammed Suhail Munshi

2023-06-30 15:43:29 +0100

[diff] [blame]

623

T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

624

Gunes Bayir

2023-04-13 18:22:58 +0100

[diff] [blame]

625

TILE(int, M0, 1, indirect_buffer);

626

LOOP_UNROLLING(int, _i, 0, 1, M0,

627

{

628

indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));

629

});

630

631

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);

632

}

633

#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)

Omar Al Khatib

2023-04-13 14:56:23 +0100

[diff] [blame]

634

635

#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)

636

/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed

637

*

638

* @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it

639

* should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension

640

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)

641

* @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).

642

* @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)

Mohammed Suhail Munshi

2023-05-25 16:48:43 +0100

[diff] [blame]

643

* @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.

Mohammed Suhail Munshi

2023-06-30 15:43:29 +0100

[diff] [blame]

644

* @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT

Omar Al Khatib

2023-04-13 14:56:23 +0100

[diff] [blame]

645

* @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)

646

* @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)

647

* @note Only the following configurations of M0, N0 and K0 are currently supported:

648

* - M0 = 1, 2, 3, 4, 8, 16

649

* - N0 = 1, 2, 3, 4, 8, 16

650

* - K0 = 1, 2, 3, 4, 8, 16

651

* @note Values > 8 for M0, N0 and K0 are not expected to be efficient

652

*

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

653

* @param[in] lhs_ptr Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED

654

* @param[in] lhs_stride_y Stride of the lhs matrix in Y (2nd) dimension (in bytes)

655

* @param[in] lhs_stride_z Stride of the lhs tensor in Z (3rd) dimension (in bytes)

656

* @param[in] lhs_w The width of the lhs tensor

657

* @param[in] lhs_h The height of the lhs tensor

658

* @param[in] lhs_n Number of the matrices (buffers) in the batch

659

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix

660

* @param[in] rhs_ptr Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr

661

* @param[in] rhs_stride_y Stride of the rhs matrix in Y (2nd) dimension (in bytes)

662

* @param[in] rhs_stride_z Stride of the rhs tensor in Z (3rd) dimension (in bytes)

663

* @param[in] rhs_w The width of the rhs tensor

664

* @param[in] rhs_h The height of the rhs tensor

665

* @param[in] rhs_n Number of the matrices (buffers) in the batch

666

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix

667

* @param[in] bias_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr

668

* @param[in] bias_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)

669

* @param[in] bias_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)

670

* @param[in] bias_w (Optional) The size of the width dimension of the bias tensor

671

* @param[in] bias_h (Optional) The size of the height dimension of the bias tensor

672

* @param[in] bias_n (Optional) The size of the depth dimension of the bias tensor

673

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor

674

* @param[out] dst_ptr Pointer to the dst matrix. Supported data types: same as @p lhs_ptr

675

* @param[in] dst_stride_y Stride of the dst matrix in Y (2nd) dimension (in bytes)

676

* @param[in] dst_stride_z Stride of the dst tensor in Z (3rd) dimension (in bytes)

677

* @param[in] dst_w The width of the dst tensor

678

* @param[in] dst_h The height of the dst tensor

679

* @param[in] dst_n Number of the matrices (buffers) in the batch

680

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the dst matrix

Omar Al Khatib

2023-04-13 14:56:23 +0100

[diff] [blame]

681

*/

682

__kernel void mat_mul_native_quantized_t_t(

683

TENSOR3D_T(lhs, BUFFER),

684

TENSOR3D_T(rhs, BUFFER),

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

685

#ifdef BIAS

686

TENSOR3D_T(bias, BUFFER),

687

#endif // defined(BIAS)

Omar Al Khatib

2023-04-13 14:56:23 +0100

[diff] [blame]

688

TENSOR3D_T(dst, BUFFER))

689

{

690

const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);

691

const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);

692

const uint z = GET_SPATIAL_IDX(2, 1, 0);

693

694

// Compute LHS/RHS/DST matrix address

695

lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;

696

rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;

697

dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

698

699

// Initialize the accumulators

700

TILE(int, M0, N0, acc);

701

LOOP_UNROLLING(int, i, 0, 1, M0,

702

{

703

acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);

704

})

705

706

TILE(int, 1, M0, a_sum);

707

a_sum[0].v = 0;

708

709

TILE(int, 1, N0, b_sum);

b_sum[0].v = 0;

int k;

for(k = 0; k <= K - K0; k += K0)

714

{

715

TILE(DATA_TYPE, M0, K0, a);

716

TILE(DATA_TYPE, N0, K0, b);

717

718

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0;

})

// Load tile from the lhs tensor in a transposed fashion

729

// see mat_mul_native_quantized_nt_nt main loop for more explanation

730

T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

731

732

// Load tile from the rhs tensor

733

T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

734

735

T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);

736

737

LOOP_UNROLLING(int, i, 0, 1, K0,

738

{

739

LOOP_UNROLLING(int, j, 0, 1, M0,

740

{

741

a_sum[0].s[j] += (int)a[j].s[i];

})

})

LOOP_UNROLLING(int, i, 0, 1, N0,

746

{

747

LOOP_UNROLLING(int, j, 0, 1, K0,

748

{

749

b_sum[0].s[i] += (int)b[i].s[j];

})

})

lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;

754

rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);

}

#if((K % K0) != 0)

/* Leftover Loop */

for(; k < K; ++k)

{

TILE(DATA_TYPE, M0, 1, a);

762

TILE(DATA_TYPE, N0, 1, b);

763

764

LOOP_UNROLLING(int, i, 0, 1, M0,

{

a[i].v = 0;

})

LOOP_UNROLLING(int, i, 0, 1, N0,

{

b[i].v = 0;

})

// Load tile from the lhs tensor in a transposed fashion

775

// see mat_mul_native_quantized_nt_nt main loop for more explanation

776

T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);

777

778

// Load tile from the rhs tensor

779

T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);

780

781

T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);

782

783

LOOP_UNROLLING(int, i, 0, 1, 1,

784

{

785

LOOP_UNROLLING(int, j, 0, 1, M0,

786

{

787

a_sum[0].s[j] += (int)a[j].s[i];

})

})

LOOP_UNROLLING(int, i, 0, 1, N0,

792

{

793

LOOP_UNROLLING(int, j, 0, 1, 1,

794

{

795

b_sum[0].s[i] += (int)b[i].s[j];

})

})

lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;

800

rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);

801

}

802

#endif // ((K % K0) != 0)

803

804

LOOP_UNROLLING(int, i, 0, 1, M0,

805

{

806

LOOP_UNROLLING(int, j, 0, 1, N0,

807

{

Mohammed Suhail Munshi

2023-06-19 14:57:57 +0100

[diff] [blame]

808

acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];

Omar Al Khatib

2023-04-13 14:56:23 +0100

[diff] [blame]

})

})

const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;

813

const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

814

Mohammed Suhail Munshi

2023-06-27 14:25:58 +0100

[diff] [blame]

815

#ifdef BIAS

816

perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);

817

#endif // defined(BIAS)

818

Omar Al Khatib

2023-04-13 14:56:23 +0100

[diff] [blame]

819

// Quantize the tile

820

TILE(DATA_TYPE, M0, N0, accq);

821

T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);

822

Mohammed Suhail Munshi

2023-06-30 15:43:29 +0100

[diff] [blame]

823

T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);

824

Omar Al Khatib