/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "activation_float_helpers.h"
#include "helpers.h"
#include "tile_helpers.h"

#ifdef BIAS
// This function performs in-place bias addition for float/half data types when bias is enabled.
// Note: The tile's dimensions (M0 and N0) must be passed at compile time using -DN0 and -DM0 (e.g. -DN0=8, -DM0=4).
inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
{
    TILE(DATA_TYPE, 1, N0, bias_tile);

    // The T_LOAD below expands to use bias_ptr and bias_offset_first_element_in_bytes
    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);

    // acc = acc + bias[broadcasted]
    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
}
#endif // defined(BIAS)
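
// A conceptual sketch (illustrative only, not the actual macro expansion) of
// what the broadcast addition in perform_bias_addition computes: the single
// N0-wide bias row is added to each of the M0 accumulator rows.
//
//     for(int m = 0; m < M0; ++m)
//     {
//         acc[m].v += bias_tile[0].v;
//     }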

#if defined(MAT_MUL_NATIVE_NT_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension.
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The fused activation function should be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for bounded activation functions.
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
 *  - K0 = 1, 2, 3, 4, 8, 16
 * @note Values > 8 for M0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
 * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                              The width of the lhs tensor
 * @param[in]  lhs_h                              The height of the lhs tensor
 * @param[in]  lhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
 * @param[in]  rhs_img                            (Optional) Read-only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
 * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                              The width of the rhs tensor
 * @param[in]  rhs_h                              The height of the rhs tensor
 * @param[in]  rhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                              The width of the dst tensor
 * @param[in]  dst_h                              The height of the dst tensor
 * @param[in]  dst_n                              Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_nt_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
#ifdef BIAS
    TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(DATA_TYPE, M0, N0, acc);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = 0.f;
    })

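    // The rhs matrices of the batch are stacked along the height (y) dimension,
    // so z * rhs_h below is the row offset of this batch's rhs matrix; keeping
    // the offset two-dimensional also suits the RHS_TENSOR_TYPE=IMAGE path.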
    const int rhs_z = z * rhs_h;
    int k;
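    // Blocked loop over K: each iteration multiplies an M0 x K0 lhs tile by a
    // K0 x N0 rhs tile and accumulates into acc. For example, with K=6 and K0=4
    // this loop runs once (k = 0) and the leftover loop below handles k = 4, 5.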
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, K0, N0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, acc);

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if K % K0 != 0
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, 1, N0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, acc);

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // K % K0 != 0

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

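    // Row remapping for the partial store: when this work-item handles the
    // boundary block (y_cond), rows past PARTIAL_STORE_M0 are clamped to the
    // last valid row so the indirect store below stays in bounds.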
    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_NT_NT)
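
// A minimal host-side sketch (illustrative only; the option values below are
// assumptions, not taken from this file) of how the compile-time options the
// kernels in this file require might be supplied via the standard OpenCL C API:
//
//     const char *build_opts =
//         "-DMAT_MUL_NATIVE_NT_NT -DDATA_TYPE=float "
//         "-DM0=4 -DN0=4 -DK0=4 -DK=32 "
//         "-DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0 "
//         "-DRHS_TENSOR_TYPE=BUFFER "
//         "-DACTIVATION_TYPE=identity -DA_VAL=0.0f -DB_VAL=0.0f";
//     cl_int err = clBuildProgram(program, 1, &device, build_opts, NULL, NULL);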

#if defined(MAT_MUL_NATIVE_NT_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension.
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The fused activation function should be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for bounded activation functions.
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 > 0
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
 * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                              The width of the lhs tensor
 * @param[in]  lhs_h                              The height of the lhs tensor
 * @param[in]  lhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
 * @param[in]  rhs_img                            (Optional) Read-only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
 * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                              The width of the rhs tensor
 * @param[in]  rhs_h                              The height of the rhs tensor
 * @param[in]  rhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                              The width of the dst tensor
 * @param[in]  dst_h                              The height of the dst tensor
 * @param[in]  dst_n                              Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
                                  TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
#ifdef BIAS
                                  TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
                                  TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(DATA_TYPE, M0, N0, acc);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = 0.f;
    })

    const int rhs_z = z * rhs_h;
    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // On Midgard, transpose the rhs tile in registers and use the NT/NT
        // T_MMUL: it is partly vectorized and triggers fewer loop unrollings
        // than the NT/T version, and the code behaves as expected. Although this
        // is not a performant solution for the specified architecture, it is
        // necessary to overcome some limitations.
        TILE(DATA_TYPE, K0, N0, bt);
        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                bt[j].s[i] = b[i].s[j];
            })
        })
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, bt, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
    }

#if K % K0 != 0
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, M0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // See the main loop for the explanation of this part
        TILE(DATA_TYPE, 1, N0, bt);
        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            bt[0].s[i] = b[i].s[0];
        })
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, bt, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
    }
#endif // K % K0 != 0

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_NT_T)

#if defined(MAT_MUL_NATIVE_T_NT)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension.
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The fused activation function should be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for bounded activation functions.
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 8, 16
 *  - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
 *  - K0 > 0
 * @note Values > 8 for M0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
 * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                              The width of the lhs tensor
 * @param[in]  lhs_h                              The height of the lhs tensor
 * @param[in]  lhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
 * @param[in]  rhs_img                            (Optional) Read-only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
 * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                              The width of the rhs tensor
 * @param[in]  rhs_h                              The height of the rhs tensor
 * @param[in]  rhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                              The width of the dst tensor
 * @param[in]  dst_h                              The height of the dst tensor
 * @param[in]  dst_n                              Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_t_nt(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
#ifdef BIAS
    TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(DATA_TYPE, M0, N0, acc);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = 0.f;
    })

    const int rhs_z = z * rhs_h;
    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, K0, M0, a);
        TILE(DATA_TYPE, K0, N0, b);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // For explanation, see mat_mul_native_nt_t
        TILE(DATA_TYPE, M0, K0, at);
        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                at[j].s[i] = a[i].s[j];
            })
        })
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, b, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, NT, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
    }

#if K % K0 != 0
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, 1, M0, a);
        TILE(DATA_TYPE, 1, N0, b);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // For explanation, see mat_mul_native_nt_t
        TILE(DATA_TYPE, M0, 1, at);
        LOOP_UNROLLING(int, j, 0, 1, M0,
        {
            at[j].s[0] = a[0].s[j];
        })
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, b, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, NT, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
    }
#endif // K % K0 != 0

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_T_NT)

#if defined(MAT_MUL_NATIVE_T_T)
/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only
 *
 * @note The "batch" here expresses the number of matrix multiplications to run in parallel. However, it
 *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension.
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
 * @note The fused activation function should be passed at compile time using -DACTIVATION_TYPE; -DA_VAL and -DB_VAL are used as the min and max bounds for bounded activation functions.
 * @note The number of leftover output rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
 * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
 * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
 * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_T)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 8, 16
 *  - N0 = 1, 2, 3, 4, 8, 16
 *  - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
 * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
 *
 * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
 * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  lhs_w                              The width of the lhs tensor
 * @param[in]  lhs_h                              The height of the lhs tensor
 * @param[in]  lhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
 * @param[in]  rhs_img                            (Optional) Read-only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
 * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
 * @param[in]  rhs_w                              The width of the rhs tensor
 * @param[in]  rhs_h                              The height of the rhs tensor
 * @param[in]  rhs_n                              Number of matrices (buffers) in the batch
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
 * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
 * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
 * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
 * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
 * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
 * @param[in]  dst_w                              The width of the dst tensor
 * @param[in]  dst_h                              The height of the dst tensor
 * @param[in]  dst_n                              Number of matrices (buffers) in the batch
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
 */
__kernel void mat_mul_native_t_t(
    TENSOR3D_T(lhs, BUFFER),
    TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
#ifdef BIAS
    TENSOR3D_T(bias, BUFFER),
#endif // defined(BIAS)
    TENSOR3D_T(dst, BUFFER))
{
    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    const uint z = GET_SPATIAL_IDX(2, 1, 0);

    // Compute LHS/RHS/DST matrix address
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;

    // Initialize the accumulators
    TILE(DATA_TYPE, M0, N0, acc);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        acc[i].v = 0.f;
    })

    const int rhs_z = z * rhs_h;
    int k;
    for(k = 0; k <= K - K0; k += K0)
    {
        TILE(DATA_TYPE, K0, M0, a);
        TILE(DATA_TYPE, N0, K0, b);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // For explanation, see mat_mul_native_nt_t
        TILE(DATA_TYPE, M0, K0, at);
        TILE(DATA_TYPE, K0, N0, bt);

        LOOP_UNROLLING(int, i, 0, 1, K0,
        {
            LOOP_UNROLLING(int, j, 0, 1, M0,
            {
                at[j].s[i] = a[i].s[j];
            })
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            LOOP_UNROLLING(int, j, 0, 1, K0,
            {
                bt[j].s[i] = b[i].s[j];
            })
        })

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, bt, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
    }

#if K % K0 != 0
    /* Leftover Loop */
    for(; k < K; ++k)
    {
        TILE(DATA_TYPE, 1, M0, a);
        TILE(DATA_TYPE, N0, 1, b);

        LOOP_UNROLLING(int, i, 0, 1, 1,
        {
            a[i].v = 0.f;
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            b[i].v = 0.f;
        })

        // Load tile from the lhs/rhs tensors
        T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);

#if GPU_ARCH == GPU_ARCH_MIDGARD
        // For explanation, see mat_mul_native_nt_t
        TILE(DATA_TYPE, M0, 1, at);
        TILE(DATA_TYPE, 1, N0, bt);

        LOOP_UNROLLING(int, j, 0, 1, M0,
        {
            at[j].s[0] = a[0].s[j];
        })

        LOOP_UNROLLING(int, i, 0, 1, N0,
        {
            bt[0].s[i] = b[i].s[0];
        })

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, bt, acc);
#else  // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
    }
#endif // K % K0 != 0

    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;

    TILE(int, M0, 1, indirect_buffer);
    LOOP_UNROLLING(int, _i, 0, 1, M0,
    {
        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
    });

#ifdef BIAS
    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
}
#endif // defined(MAT_MUL_NATIVE_T_T)