Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2018-12-06 17:13:09 +0000

[diff] [blame]

45

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

46

/** Boundary-aware front end for LOAD_TENSOR_M0XN0.
 *
 * Chooses how many rows and columns are forwarded to LOAD_TENSOR_M0XN0 for the block addressed by
 * the work-item:
 *  - rows:    PARTIAL_LOAD_M0 when (y * M0 + M0 >= SRC_HEIGHT) and PARTIAL_LOAD_M0 != 0, otherwise M0
 *  - columns: PARTIAL_LOAD_K0 when (x * K0 + K0 >= SRC_WIDTH) and PARTIAL_LOAD_K0 != 0, otherwise K0
 * All remaining arguments are forwarded untouched.
 *
 * @note The macro is not self-contained: it reads the identifiers x and y from the scope in which it
 *       is expanded, and the symbols SRC_WIDTH, SRC_HEIGHT, PARTIAL_LOAD_M0 and PARTIAL_LOAD_K0.
 * @note The continuation backslash must be the last character of every line of the definition and no
 *       blank line may appear inside it, otherwise the preprocessor ends the macro early.
 *
 * @param[in] M0           Number of rows of a full block
 * @param[in] K0           Number of columns of a full block
 * @param[in] DATA_TYPE    Forwarded to LOAD_TENSOR_M0XN0
 * @param[in] a            Forwarded to LOAD_TENSOR_M0XN0
 * @param[in] input_ptr    Forwarded to LOAD_TENSOR_M0XN0
 * @param[in] src_stride_y Forwarded to LOAD_TENSOR_M0XN0
 * @param[in] zin          Forwarded to LOAD_TENSOR_M0XN0
 */
#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    ({ \
        if(y * (M0) + (M0) >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) \
        { \
            if(x * (K0) + (K0) >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
            { \
                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
            else \
            { \
                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
        } \
        else \
        { \
            if(x * (K0) + (K0) >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
            { \
                LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
            else \
            { \
                LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            } \
        } \
    })

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

72

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in

73

* the output matrix unrolling the values.

74

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

75

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

76

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

77

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

78

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

79

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

80

* @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)

81

* @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

82

* @note Only the following values for M0, K0 and V0 are supported:

83

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

84

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

85

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

86

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

87

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

88

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

89

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

90

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

91

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

92

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

93

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

94

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

95

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

96

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

97

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

98

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

99

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

100

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

101

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

102

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

103

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

104

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

105

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

106

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

107

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

108

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

109

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

110

*/

111

__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),

112

TENSOR3D_DECLARATION(dst)

113

#if defined(REINTERPRET_INPUT_AS_3D)

114

,

115

uint cross_plane_pad

116

#endif // REINTERPRET_INPUT_AS_3D

117

)

118

{

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

119

// Block size

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

120

#define BLOCK_SIZE ((M0) * (K0))

121

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

122

// Output offset X

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

123

#if defined(INTERLEAVE)

124

#define OUTPUT_OFFSET_X (K0)

125

#else // defined(INTERLEAVE)

126

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

127

#endif // defined(INTERLEAVE)

128

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

129

// Output step X

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

130

#if defined(INTERLEAVE)

131

#define OUTPUT_STEP_X (K0) * (V0)

132

#else // Do not interleave

133

#define OUTPUT_STEP_X (K0)

134

#endif // defined(INTERLEAVE)

135

136

// Compute source and destination addresses

137

uint x = get_global_id(0);

138

uint y = get_global_id(1);

139

uint z = get_global_id(2);

140

141

// ------------------ Compute input/output addresses ---------------------------

142

143

// Compute the input address

144

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

145

146

// Compute the output address

147

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *

148

(uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

149

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

150

// Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;

151

REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

152

153

#if defined(REINTERPRET_INPUT_AS_3D)

154

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

155

// multiply src_stride_z by DEPTH_GEMM3D

156

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

157

input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

158

159

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

160

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

161

162

#else // defined(REINTERPRET_INPUT_AS_3D)

163

164

input_ptr += z * (uint)src_stride_z;

165

166

#endif // defined(REINTERPRET_INPUT_AS_3D)

167

168

// Add offset for batched GEMM

169

output_ptr += z * (uint)dst_stride_z;

170

171

// ---------------------------Load input values --------------------------------

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

172

// Load values from the LHS matrix

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

173

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);

174

175

LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);

176

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

177

// ---------------------------Store output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

178

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

179

STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

180

181

#undef BLOCK_SIZE

182

#undef OUTPUT_OFFSET_X

183

#undef OUTPUT_STEP_X

184

}

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

185

186

/** Gathers component @p i of the row vectors a0 ... a(M0-1) and stores the M0 values contiguously.
 *
 * The destination is @p output_ptr + 0x<i> * @p output_step_x * sizeof(DATA_TYPE) (byte address).
 * For M0 = 5, 6 and 7 the values are written as a 4-wide store followed by a scalar, 2-wide or 3-wide
 * store at element offset 4 (OpenCL C vector widths are 2, 3, 4, 8 and 16).
 *
 * @note The macro reads a0 ... a(M0-1) from the scope in which it is expanded.
 * @note @p i is pasted into both the component selector (.s<i>) and a hexadecimal literal (0x<i>), so it
 *       must be a single hexadecimal digit (0-9, A-F).
 * @note The continuation backslash must end every line of each definition; a blank line inside a
 *       definition terminates the macro.
 *
 * @param[in] output_ptr    Destination base address (byte pointer)
 * @param[in] output_step_x Distance, in elements, between the destinations of consecutive values of @p i
 * @param[in] i             Hexadecimal digit selecting the vector component to gather
 */
#if M0 == 2
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 3 // M0 == 3
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 4 // M0 == 4
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 5 // M0 == 5
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        DATA_TYPE res1 = a4.s##i; \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        *((__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4) = res1; \
    })
#elif M0 == 6 // M0 == 6
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VEC_DATA_TYPE(DATA_TYPE, 2) \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        VSTORE(2) \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 7 // M0 == 7
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VEC_DATA_TYPE(DATA_TYPE, 3) \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        VSTORE(3) \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 8 // M0 == 8
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#else // M0 not supported
#error "M0 value not supported"
#endif // M0 conditions

255

256

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in

257

* the output matrix unrolling the values.

258

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

259

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

260

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

261

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

262

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

263

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

264

* @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)

265

* @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

266

* @note Only the following values for M0, K0 and V0 are supported:

267

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

268

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

269

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

270

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

271

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

272

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

273

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

274

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

275

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

276

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

277

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

278

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

279

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

280

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

281

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

282

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

283

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

284

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

285

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

286

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

287

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

288

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

289

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

290

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

291

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

292

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

293

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

294

*/

295

__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),

296

TENSOR3D_DECLARATION(dst)

297

#if defined(REINTERPRET_INPUT_AS_3D)

298

,

299

uint cross_plane_pad

300

#endif // REINTERPRET_INPUT_AS_3D

)

{

// Block size

#define BLOCK_SIZE ((M0) * (K0))

305

306

// Output offset X

307

#if defined(INTERLEAVE)

308

#define OUTPUT_OFFSET_X (M0)

309

#else // defined(INTERLEAVE)

310

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

311

#endif // defined(INTERLEAVE)

312

313

// Output step X

314

#if defined(INTERLEAVE)

315

#define OUTPUT_STEP_X (M0) * (V0)

316

#else // Do not interleave

317

#define OUTPUT_STEP_X (M0)

318

#endif // defined(INTERLEAVE)

319

320

// Compute source and destination addresses

321

uint x = get_global_id(0);

322

uint y = get_global_id(1);

323

uint z = get_global_id(2);

324

325

// ------------------ Compute input/output addresses ---------------------------

326

327

// Compute the input address

328

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

329

330

// Compute the output address

331

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *

332

(uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

333

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

334

// Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;

335

REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

336

337

#if defined(REINTERPRET_INPUT_AS_3D)

338

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

339

// multiply src_stride_z by DEPTH_GEMM3D

340

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

341

input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

342

343

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

344

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

345

346

#else // defined(REINTERPRET_INPUT_AS_3D)

347

348

input_ptr += z * (uint)src_stride_z;

349

350

#endif // defined(REINTERPRET_INPUT_AS_3D)

351

352

// Add offset for batched GEMM

353

output_ptr += z * (uint)dst_stride_z;

354

355

// ---------------------------Load input values --------------------------------

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

356

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

357

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

358

LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);

359

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

360

// ---------------------------Transpose and store block -----------------------

361

362

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);

363

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);

364

#if K0 > 2

365

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

366

#endif // K0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

367

#if K0 > 3

368

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);

369

#endif // K0 > 3

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

370

#if K0 > 4

371

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);

372

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);

373

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);

374

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);

375

#endif // K0 > 4

376

#if K0 > 8

377

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);

378

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);

379

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);

380

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);

381

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);

382

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);

383

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);

384

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);

#endif // K0 > 8

#undef BLOCK_SIZE

#undef OUTPUT_OFFSET_X

389

#undef OUTPUT_STEP_X

390

}

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

391

#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

392

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

393

#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

394

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in

395

* the output matrix unrolling the values.

396

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

397

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

398

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

399

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

400

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

401

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

402

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

403

* N0: 2,3,4,8,16

404

* K0: 1,2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

405

* H0: greater than 0

406

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

407

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

408

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

409

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

410

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

411

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

412

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

413

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

414

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

415

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

416

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

417

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

418

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

419

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

420

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

421

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

422

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

423

*/

424

__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),

425

TENSOR3D_DECLARATION(dst))

426

{

427

// Block size

428

#define BLOCK_SIZE ((K0) * (N0))

429

430

// Output offset X

431

#if defined(INTERLEAVE)

432

#define OUTPUT_OFFSET_X (N0)

433

#else // defined(INTERLEAVE)

434

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

435

#endif // defined(INTERLEAVE)

436

437

// Output step X

438

#if defined(INTERLEAVE)

439

#define OUTPUT_STEP_X (N0) * (H0)

440

#else // Do not interleave

441

#define OUTPUT_STEP_X (N0)

442

#endif // defined(INTERLEAVE)

443

444

// Compute source and destination addresses

445

uint x = get_global_id(0);

446

uint y = get_global_id(1);

447

uint z = get_global_id(2);

448

449

// ------------------ Compute input/output addresses ---------------------------

450

451

// Compute the input address

452

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

453

454

// Compute the output address

455

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((

456

x / (uint)H0)

457

* (uint)dst_stride_y)

458

+ z * (uint)dst_stride_z;

459

460

// ---------------------------Load input values --------------------------------

461

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

462

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

463

464

// Load values from the RHS matrix

465

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

466

#if K0 > 1

467

if(y * (uint)K0 + 1 < SRC_HEIGHT)

468

{

469

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

}

#endif // K0 > 1

#if K0 > 2

if(y * (uint)K0 + 2 < SRC_HEIGHT)

474

{

475

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

476

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

477

#endif // K0 > 2

478

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

479

if(y * (uint)K0 + 3 < SRC_HEIGHT)

480

{

481

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

482

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

483

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

484

#if K0 > 4

485

if(y * (uint)K0 + 4 < SRC_HEIGHT)

486

{

487

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

488

}

489

if(y * (uint)K0 + 5 < SRC_HEIGHT)

490

{

491

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

492

}

493

if(y * (uint)K0 + 6 < SRC_HEIGHT)

494

{

495

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

496

}

497

if(y * (uint)K0 + 7 < SRC_HEIGHT)

498

{

499

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

500

}

501

#endif // K0 > 4

502

#if K0 > 8

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

503

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

504

{

505

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

506

}

507

if(y * (uint)K0 + 9 < SRC_HEIGHT)

508

{

509

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

510

}

511

if(y * (uint)K0 + 10 < SRC_HEIGHT)

512

{

513

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

514

}

515

if(y * (uint)K0 + 11 < SRC_HEIGHT)

516

{

517

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

518

}

519

if(y * (uint)K0 + 12 < SRC_HEIGHT)

520

{

521

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

522

}

523

if(y * (uint)K0 + 13 < SRC_HEIGHT)

524

{

525

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

526

}

527

if(y * (uint)K0 + 14 < SRC_HEIGHT)

528

{

529

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

530

}

531

if(y * (uint)K0 + 15 < SRC_HEIGHT)

532

{

533

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Store output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

538

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

539

STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

540

541

#undef BLOCK_SIZE

542

#undef OUTPUT_OFFSET_X

#undef OUTPUT_STEP_X

}

#if defined(TRANSPOSE)

547

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in

548

* the output matrix unrolling the values.

549

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

550

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

551

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

552

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

553

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

554

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

555

* @note The option -DTRANSPOSE must be passed at compile time.

556

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

557

* N0: 2,3,4,8,16

558

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

559

* H0: greater than 0

560

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

561

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

562

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

563

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

564

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

565

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

566

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

567

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

568

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

569

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

570

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

571

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

572

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

573

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

574

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

575

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

576

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

577

*/

578

__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),

579

TENSOR3D_DECLARATION(dst))

580

{

581

// Block size

582

#define BLOCK_SIZE ((K0) * (N0))

583

584

// Output offset X

585

#if defined(INTERLEAVE)

586

#define OUTPUT_OFFSET_X (K0)

587

#else // defined(INTERLEAVE)

588

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

589

#endif // defined(INTERLEAVE)

590

591

// Output step X

592

#if defined(INTERLEAVE)

593

#define OUTPUT_STEP_X (K0) * (H0)

594

#else // Do not interleave

595

#define OUTPUT_STEP_X (K0)

596

#endif // defined(INTERLEAVE)

597

598

// Compute source and destination addresses

599

uint x = get_global_id(0);

600

uint y = get_global_id(1);

601

uint z = get_global_id(2);

602

603

// ------------------ Compute input/output addresses ---------------------------

604

605

// Compute the input address

606

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

607

608

// Compute the output address

609

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /

610

(uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

611

612

// ---------------------------Load input values --------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

613

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

614

615

// Load values from the RHS matrix

616

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

617

if(y * (uint)K0 + 1 < SRC_HEIGHT)

618

{

619

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

620

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

621

#if K0 > 2

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

622

if(y * (uint)K0 + 2 < SRC_HEIGHT)

623

{

624

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

625

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

626

#endif // K0 > 2

627

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

628

if(y * (uint)K0 + 3 < SRC_HEIGHT)

629

{

630

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

631

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

632

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

633

#if K0 > 4

634

if(y * (uint)K0 + 4 < SRC_HEIGHT)

635

{

636

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

637

}

638

if(y * (uint)K0 + 5 < SRC_HEIGHT)

639

{

640

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

641

}

642

if(y * (uint)K0 + 6 < SRC_HEIGHT)

643

{

644

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

645

}

646

if(y * (uint)K0 + 7 < SRC_HEIGHT)

647

{

648

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

649

}

650

#endif // K0 > 4

651

#if K0 > 8

Gian Marco Iodice

8912434

2018-12-19 14:17:22 +0000

[diff] [blame]

652

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

653

{

654

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

655

}

656

if(y * (uint)K0 + 9 < SRC_HEIGHT)

657

{

658

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

659

}

660

if(y * (uint)K0 + 10 < SRC_HEIGHT)

661

{

662

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

663

}

664

if(y * (uint)K0 + 11 < SRC_HEIGHT)

665

{

666

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

667

}

668

if(y * (uint)K0 + 12 < SRC_HEIGHT)

669

{

670

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

671

}

672

if(y * (uint)K0 + 13 < SRC_HEIGHT)

673

{

674

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

675

}

676

if(y * (uint)K0 + 14 < SRC_HEIGHT)

677

{

678

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

679

}

680

if(y * (uint)K0 + 15 < SRC_HEIGHT)

681

{

682

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Transpose the block ------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

687

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

688

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

689

#if K0 == 2

690

// This part computes the following transpositions:

// 2x2 -> 2x2

// 2x4 -> 4x2

// 2x8 -> 8x2

// 2x16 -> 16x2

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);

696

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);

697

#if N0 > 2

698

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);

699

#endif // N0 > 2

700

#if N0 > 3

701

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);

702

#endif // N0 > 3

703

#if N0 > 4

704

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);

705

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);

706

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);

707

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);

708

#endif // N0 > 4

709

#if N0 > 8

710

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);

711

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);

712

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);

713

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);

714

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);

715

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);

716

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);

717

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);

718

#endif // N0 > 8

719

720

#elif K0 == 3 // K0 == 2

721

// This part computes the following transpositions:

// 3x2 -> 2x3

// 3x4 -> 4x3

// 3x8 -> 8x3

// 3x16 -> 16x3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

726

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);

727

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

728

#if N0 > 2

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

729

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

730

#endif // N0 > 2

731

#if N0 > 3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

732

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

733

#endif // N0 > 3

734

#if N0 > 4

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

735

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);

736

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);

737

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);

738

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

739

#endif // N0 > 4

740

#if N0 > 8

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

741

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);

742

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);

743

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);

744

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);

745

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);

746

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);

747

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);

748

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

749

#endif // N0 > 8

750

751

#elif K0 == 4 // K0 == 4

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

752

// This part computes the following transpositions:

// 4x2 -> 2x4

// 4x4 -> 4x4

// 4x8 -> 8x4

// 4x16 -> 16x4

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);

758

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);

759

#if N0 > 2

760

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

761

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

762

#if N0 > 3

763

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);

764

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

765

#if N0 > 4

766

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);

767

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);

768

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);

769

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);

770

#endif // N0 > 4

771

#if N0 > 8

772

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);

773

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);

774

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);

775

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);

776

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);

777

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);

778

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);

779

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);

780

#endif // N0 > 8

781

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

782

#elif K0 == 8 // K0 == 8

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

783

// This part computes the following transpositions:

// 8x2 -> 2x8

// 8x4 -> 4x8

// 8x8 -> 8x8

// 8x16 -> 16x8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

788

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);

789

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

790

#if N0 > 2

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

791

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

792

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

793

#if N0 > 3

794

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);

795

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

796

#if N0 > 4

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

797

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);

798

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);

799

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);

800

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

801

#endif // N0 > 4

802

#if N0 > 8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

803

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);

804

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);

805

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);

806

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);

807

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);

808

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);

809

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);

810

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

811

#endif // N0 > 8

812

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

813

#elif K0 == 16 // K0 == 16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

814

815

// This part computes the following transpositions:

// 16x2 -> 2x16

// 16x4 -> 4x16

// 16x8 -> 8x16

// 16x16 -> 16x16

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,

821

a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);

822

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,

823

a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);

824

#if N0 > 2

825

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,

826

a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

827

#endif // N0 > 2

828

#if N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

829

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,

830

a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

831

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

832

#if N0 > 4

833

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,

834

a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);

835

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,

836

a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);

837

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,

838

a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);

839

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,

840

a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);

841

#endif // N0 > 4

842

#if N0 > 8

843

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,

844

a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);

845

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,

846

a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);

847

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,

848

a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);

849

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,

850

a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);

851

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,

852

a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);

853

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,

854

a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);

855

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,

856

a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);

857

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,

858

a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);

#endif // N0 > 8

#else // N0 == 16

#error "Not supported N0 value"

863

#endif // N0 > 2

864

865

// ---------------------------Store the output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

866

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

867

STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

868

869

#undef BLOCK_SIZE

870

#undef OUTPUT_OFFSET_X

871

#undef OUTPUT_STEP_X

872

}

873

#endif // defined(TRANSPOSE)

874

#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

875

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

876

#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

877

878

/* Pastes the two tokens a and b into a single token. */
#define CONCAT(a, b) a##b

/* ARM_DOT<k>(a, b, c): accumulates into c the dot product of the first k components of a and b,
 * one fma per component (c = fma(a.sI, b.sI, c)). For k == 1, a and b are used as scalars.
 * Every argument is parenthesised before a component is selected, so any expression
 * (e.g. a dereferenced pointer) can be passed safely. c must be an lvalue.
 */
#define ARM_DOT1(a, b, c)         \
    ({                            \
        (c) = fma((a), (b), (c)); \
    })
#define ARM_DOT2(a, b, c)               \
    ({                                  \
        (c) = fma((a).s0, (b).s0, (c)); \
        (c) = fma((a).s1, (b).s1, (c)); \
    })
#define ARM_DOT3(a, b, c)               \
    ({                                  \
        ARM_DOT2(a, b, c);              \
        (c) = fma((a).s2, (b).s2, (c)); \
    })
#define ARM_DOT4(a, b, c)               \
    ({                                  \
        ARM_DOT3(a, b, c);              \
        (c) = fma((a).s3, (b).s3, (c)); \
    })
/* The 8- and 16-wide variants split the operands into their .lo/.hi halves. */
#define ARM_DOT8(a, b, c)                \
    ({                                   \
        ARM_DOT4(((a).lo), ((b).lo), c); \
        ARM_DOT4(((a).hi), ((b).hi), c); \
    })
#define ARM_DOT16(a, b, c)               \
    ({                                   \
        ARM_DOT8(((a).lo), ((b).lo), c); \
        ARM_DOT8(((a).hi), ((b).hi), c); \
    })

/* ARM_DOT_K0XN0(k0, a, b, c): for every column i in [0, N0), accumulates into component c.s<i>
 * the k0-element dot product between a and the variable named b<i> (b is a name prefix: b0, b1, ...).
 * Columns are processed in increasing order.
 *
 * The per-N0 variants are built cumulatively from a single-column helper instead of
 * repeating the full column list for every supported N0.
 */

/* Single column i: ARM_DOT<k0>(a, b<i>, c.s<i>) */
#define ARM_DOT_K0XN0_COL(k0, a, b, c, i) \
    CONCAT(ARM_DOT, k0)                   \
    ((a), (b##i), ((c).s##i))

/* Columns 0..1 */
#define ARM_DOT_K0XN0_COLS2(k0, a, b, c)   \
    ({                                     \
        ARM_DOT_K0XN0_COL(k0, a, b, c, 0); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, 1); \
    })
/* Columns 0..2 */
#define ARM_DOT_K0XN0_COLS3(k0, a, b, c)   \
    ({                                     \
        ARM_DOT_K0XN0_COLS2(k0, a, b, c);  \
        ARM_DOT_K0XN0_COL(k0, a, b, c, 2); \
    })
/* Columns 0..3 */
#define ARM_DOT_K0XN0_COLS4(k0, a, b, c)   \
    ({                                     \
        ARM_DOT_K0XN0_COLS3(k0, a, b, c);  \
        ARM_DOT_K0XN0_COL(k0, a, b, c, 3); \
    })
/* Columns 0..7 */
#define ARM_DOT_K0XN0_COLS8(k0, a, b, c)   \
    ({                                     \
        ARM_DOT_K0XN0_COLS4(k0, a, b, c);  \
        ARM_DOT_K0XN0_COL(k0, a, b, c, 4); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, 5); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, 6); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, 7); \
    })
/* Columns 0..F */
#define ARM_DOT_K0XN0_COLS16(k0, a, b, c)  \
    ({                                     \
        ARM_DOT_K0XN0_COLS8(k0, a, b, c);  \
        ARM_DOT_K0XN0_COL(k0, a, b, c, 8); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, 9); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, A); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, B); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, C); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, D); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, E); \
        ARM_DOT_K0XN0_COL(k0, a, b, c, F); \
    })

#if N0 == 2
#define ARM_DOT_K0XN0(k0, a, b, c) ARM_DOT_K0XN0_COLS2(k0, a, b, c)
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(k0, a, b, c) ARM_DOT_K0XN0_COLS3(k0, a, b, c)
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(k0, a, b, c) ARM_DOT_K0XN0_COLS4(k0, a, b, c)
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(k0, a, b, c) ARM_DOT_K0XN0_COLS8(k0, a, b, c)
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(k0, a, b, c) ARM_DOT_K0XN0_COLS16(k0, a, b, c)
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

999

1000

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1001

* The LHS matrix is NOT reshaped

1002

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1003

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1004

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1005

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1006

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

1007

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1008

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1009

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1010

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1011

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1012

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1013

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1014

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1015

* - N0 = 2, 3, 4, 8, 16

1016

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1017

* - H0 >= 1

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1018

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1019

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1020

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1021

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1022

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1023

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1024

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1025

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1026

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1027

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1028

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1029

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1030

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1031

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1032

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1033

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1034

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1035

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1036

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1037

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1038

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1039

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1040

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1041

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1042

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1043

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1044

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1045

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1046

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

1047

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1048

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1049

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1050

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1051

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1052

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1053

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1054

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1055

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1056

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1057

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1058

*/

1059

__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),

1060

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1061

#if defined(BETA)

1062

IMAGE_DECLARATION(bias),

1063

#endif // defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1064

IMAGE_DECLARATION(dst),

1065

uint lhs_stride_z,

1066

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1067

#if defined(BETA)

1068

uint bias_stride_z,

1069

#endif //defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1070

uint dst_stride_z

1071

#if defined(REINTERPRET_INPUT_AS_3D)

1072

,

1073

uint lhs_cross_plane_pad

1074

#endif // REINTERPRET_INPUT_AS_3D

1075

#if defined(REINTERPRET_OUTPUT_AS_3D)

1076

,

1077

uint dst_cross_plane_pad

1078

#endif // REINTERPRET_OUTPUT_AS_3D

1079

)

1080

{

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1081

// Block size

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1082

#define RHS_BLOCK_SIZE ((K0) * (N0))

1083

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1084

// RHS offset and step X

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1085

#if defined(RHS_INTERLEAVE)

1086

#define RHS_OFFSET_X (K0)

1087

#define RHS_STEP_X ((K0) * (H0))

1088

#define RHS_STEP_LOOP (1)

1089

#else // defined(RHS_INTERLEAVE)

1090

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1091

#define RHS_STEP_X (K0)

1092

#define RHS_STEP_LOOP (H0)

1093

#endif // defined(RHS_INTERLEAVE)

1094

1095

uint x = get_global_id(0);

1096

uint y = get_global_id(1);

1097

uint z = get_global_id(2);

1098

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1099

#if defined(DUMMY_WORK_ITEMS)

1100

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1105

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1106

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1107

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1108

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1109

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1110

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1111

1112

#if defined(MATRIX_B_DEPTH)

1113

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1114

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1115

#else // defined(MATRIX_B_DEPTH)

1116

rhs_offset += z * rhs_stride_z;

1117

#endif // defined(MATRIX_B_DEPTH)

1118

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1119

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1120

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1121

1122

#if defined(REINTERPRET_INPUT_AS_3D)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1123

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2020-10-22 16:37:12 +0100

[diff] [blame]

1124

CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1125

1126

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1127

// multiply lhs_stride_z by DEPTH_GEMM3D

1128

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1129

1130

#else // defined(REINTERPRET_INPUT_AS_3D)

1131

1132

// Add offset for batched GEMM

1133

lhs_offset += z * lhs_stride_z;

1134

1135

#endif // defined(REINTERPRET_INPUT_AS_3D)

1136

1137

// Initialize the accumulators

1138

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1139

1140

int i = 0;

1141

for(; i <= (K - K0); i += K0)

1142

{

1143

// Supported cases (M0, K0):

1144

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1145

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1146

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1147

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1148

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1149

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1150

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1151

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1152

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1153

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1154

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1155

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1156

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1157

1158

// Accumulate

1159

ARM_DOT_K0XN0(K0, a0, b, c0);

1160

#if M0 > 1

1161

ARM_DOT_K0XN0(K0, a1, b, c1);

1162

#endif // M0 > 1

1163

#if M0 > 2

1164

ARM_DOT_K0XN0(K0, a2, b, c2);

1165

#endif // M0 > 2

1166

#if M0 > 3

1167

ARM_DOT_K0XN0(K0, a3, b, c3);

1168

#endif // M0 > 3

1169

#if M0 > 4

1170

ARM_DOT_K0XN0(K0, a4, b, c4);

1171

#endif // M0 > 4

1172

#if M0 > 5

1173

ARM_DOT_K0XN0(K0, a5, b, c5);

1174

#endif // M0 > 5

1175

#if M0 > 6

1176

ARM_DOT_K0XN0(K0, a6, b, c6);

1177

#endif // M0 > 6

1178

#if M0 > 7

1179

ARM_DOT_K0XN0(K0, a7, b, c7);

1180

#endif // M0 > 7

1181

1182

lhs_offset += K0 * sizeof(DATA_TYPE);

1183

rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

1184

}

1185

1186

// Left-over accumulations

1187

for(; i < K; ++i)

1188

{

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1189

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1190

LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1191

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1192

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1193

LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1194

1195

// Accumulate

1196

ARM_DOT_K0XN0(1, a0, b, c0);

1197

#if M0 > 1

1198

ARM_DOT_K0XN0(1, a1, b, c1);

1199

#endif // M0 > 1

1200

#if M0 > 2

1201

ARM_DOT_K0XN0(1, a2, b, c2);

1202

#endif // M0 > 2

1203

#if M0 > 3

1204

ARM_DOT_K0XN0(1, a3, b, c3);

1205

#endif // M0 > 3

1206

#if M0 > 4

1207

ARM_DOT_K0XN0(1, a4, b, c4);

1208

#endif // M0 > 4

1209

#if M0 > 5

1210

ARM_DOT_K0XN0(1, a5, b, c5);

1211

#endif // M0 > 5

1212

#if M0 > 6

1213

ARM_DOT_K0XN0(1, a6, b, c6);

1214

#endif // M0 > 6

1215

#if M0 > 7

1216

ARM_DOT_K0XN0(1, a7, b, c7);

1217

#endif // M0 > 7

1218

1219

lhs_offset += sizeof(DATA_TYPE);

1220

rhs_offset += sizeof(DATA_TYPE);

1221

}

1222

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1223

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1224

1225

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1226

1227

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1228

1229

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2020-10-22 16:37:12 +0100

[diff] [blame]

1230

CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1231

1232

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1233

// multiply dst_stride_z by DEPTH_GEMM3D

1234

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1235

1236

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1237

1238

// Add offset for batched GEMM

1239

dst_addr += z * dst_stride_z;

1240

1241

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1242

1243

// Multiply by the weight of matrix-matrix product and store the result

1244

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1245

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1246

#endif // defined(ALPHA)

1247

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1248

// Add beta*bias

1249

#if defined(BETA)

1250

#if defined(BROADCAST_BIAS)

1251

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1252

1253

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1254

1255

#ifndef UNIT_BETA

1256

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1257

#endif // UNIT_BETA

1258

1259

// c = c + bias[broadcasted]

1260

ADD_BLOCK_BROADCAST(M0, c, bias0);

1261

1262

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1263

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1264

1265

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1266

1267

#ifndef UNIT_BETA

1268

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

1273

1274

#endif // defined(BROADCAST_BIAS)

1275

#endif // defined(BETA)

1276

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1277

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1278

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1279

#endif // defined(ACTIVATION_TYPE)

1280

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

1281

const bool cond_y = y == 0;

1282

const bool cond_x = ((x + 1) * N0 >= N);

1283

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1284

// Store output block

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

1285

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1286

1287

#undef RHS_BLOCK_SIZE

1288

#undef RHS_OFFSET_X

1289

#undef RHS_STEP_X

1290

}

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1291

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1292

#if defined(OPENCL_IMAGE_SUPPORT)

1293

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image

1294

* The LHS matrix is NOT reshaped

1295

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1296

*

1297

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

1298

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

1299

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1300

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

1301

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

1302

* could be different from the value returned by get_image_height(rhs_img).

1303

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1304

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1305

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

1306

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1307

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1308

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1309

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1310

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

1316

* The activation function is performed after the bias addition

1317

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1318

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1319

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1320

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1321

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1322

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1323

*

1324

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32

1325

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

1326

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

1327

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

1328

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

1329

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

1330

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

1331

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1332

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1333

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1334

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1335

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1336

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1337

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1338

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1339

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1340

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1341

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1342

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1343

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

1344

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1345

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

1346

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1347

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1348

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

1349

*/

1350

__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),

1351

__read_only image2d_t rhs_img,

1352

#if defined(BETA)

1353

IMAGE_DECLARATION(bias),

1354

#endif // defined(BETA)

1355

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

1361

uint dst_stride_z

1362

#if defined(REINTERPRET_INPUT_AS_3D)

1363

,

1364

uint lhs_cross_plane_pad

1365

#endif // REINTERPRET_INPUT_AS_3D

1366

#if defined(REINTERPRET_OUTPUT_AS_3D)

1367

,

1368

uint dst_cross_plane_pad

1369

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

1374

1375

#define LEFTOVER_K (K % K0)

1376

1377

// Block size

1378

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

1379

1380

// RHS offset and step X

1381

#if defined(RHS_INTERLEAVE)

1382

#define RHS_OFFSET_X (PIXEL_UNIT)

1383

#define RHS_STEP_X (PIXEL_UNIT * (H0))

1384

#define RHS_STEP_LOOP (1)

1385

#else // defined(RHS_INTERLEAVE)

1386

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1387

#define RHS_STEP_X PIXEL_UNIT

1388

#define RHS_STEP_LOOP (H0)

1389

#endif // defined(RHS_INTERLEAVE)

1390

1391

uint x = get_global_id(0);

1392

uint y = get_global_id(1);

1393

uint z = get_global_id(2);

1394

1395

#if defined(DUMMY_WORK_ITEMS)

1396

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1401

1402

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1403

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1404

1405

#if defined(MATRIX_B_DEPTH)

1406

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1407

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

1408

#else // defined(MATRIX_B_DEPTH)

1409

const uint z_rhs = get_global_id(2);

1410

#endif // defined(MATRIX_B_DEPTH)

1411

1412

// Compute RHS matrix coordinates

1413

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

1414

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

1415

1416

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

1417

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

1418

1419

#if defined(REINTERPRET_INPUT_AS_3D)

1420

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2020-10-22 16:37:12 +0100

[diff] [blame]

1421

CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1422

1423

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1424

// multiply lhs_stride_z by DEPTH_GEMM3D

1425

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1426

1427

#else // defined(REINTERPRET_INPUT_AS_3D)

1428

1429

// Add offset for batched GEMM

1430

lhs_offset += z * lhs_stride_z;

1431

1432

#endif // defined(REINTERPRET_INPUT_AS_3D)

1433

1434

// Initialize the accumulators

1435

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

1436

1437

int i = 0;

1438

for(; i <= (K - K0); i += K0)

1439

{

1440

// Load values from LHS matrix

1441

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

1442

1443

// Load values from RHS matrix stored in a cl_image

1444

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

1445

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

1446

1447

// Accumulate

1448

ARM_DOT_K0XN0(K0, a0, b, c0);

1449

#if M0 > 1

1450

ARM_DOT_K0XN0(K0, a1, b, c1);

1451

#endif // M0 > 1

1452

#if M0 > 2

1453

ARM_DOT_K0XN0(K0, a2, b, c2);

1454

#endif // M0 > 2

1455

#if M0 > 3

1456

ARM_DOT_K0XN0(K0, a3, b, c3);

1457

#endif // M0 > 3

1458

#if M0 > 4

1459

ARM_DOT_K0XN0(K0, a4, b, c4);

1460

#endif // M0 > 4

1461

#if M0 > 5

1462

ARM_DOT_K0XN0(K0, a5, b, c5);

1463

#endif // M0 > 5

1464

#if M0 > 6

1465

ARM_DOT_K0XN0(K0, a6, b, c6);

1466

#endif // M0 > 6

1467

#if M0 > 7

1468

ARM_DOT_K0XN0(K0, a7, b, c7);

1469

#endif // M0 > 7

1470

1471

lhs_offset += K0 * sizeof(DATA_TYPE);

1472

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

}

#if LEFTOVER_K != 0

// Note: We cannot read out-of-bound elements from the RHS matrix because

1477

// the RHS width is always multiple of K0. This is not be true for the LHS matrix

union UNION_VEC_TYPE

{

DATA_TYPE s[K0];

VEC_DATA_TYPE(DATA_TYPE, K0)

v;

};

union UNION_VEC_TYPE a0 = {.v = 0 };

1487

#if M0 > 1

1488

union UNION_VEC_TYPE a1 = {.v = 0 };

1489

#endif // M0 > 1

1490

#if M0 > 2

1491

union UNION_VEC_TYPE a2 = {.v = 0 };

1492

#endif // M0 > 2

1493

#if M0 > 3

1494

union UNION_VEC_TYPE a3 = {.v = 0 };

1495

#endif // M0 > 3

1496

#if M0 > 4

1497

union UNION_VEC_TYPE a4 = {.v = 0 };

1498

#endif // M0 > 4

1499

#if M0 > 5

1500

union UNION_VEC_TYPE a5 = {.v = 0 };

1501

#endif // M0 > 5

1502

#if M0 > 6

1503

union UNION_VEC_TYPE a6 = {.v = 0 };

1504

#endif // M0 > 6

1505

#if M0 > 7

1506

union UNION_VEC_TYPE a7 = {.v = 0 };

1507

#endif // M0 > 7

1508

1509

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

1510

1511

// Load from RHS matrix

1512

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

1513

1514

// Load from LHS matrix

1515

for(int k = 0; k < LEFTOVER_K; ++k)

1516

{

1517

a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);

1518

#if M0 > 1

1519

a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);

1520

#endif // M0 > 1

1521

#if M0 > 2

1522

a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);

1523

#endif // M0 > 2

1524

#if M0 > 3

1525

a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);

1526

#endif // M0 > 3

1527

#if M0 > 4

1528

a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);

1529

#endif // M0 > 4

1530

#if M0 > 5

1531

a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);

1532

#endif // M0 > 5

1533

#if M0 > 6

1534

a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);

1535

#endif // M0 > 6

1536

#if M0 > 7

1537

a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);

1538

#endif // M0 > 7

1539

1540

lhs_offset += sizeof(DATA_TYPE);

}

// Accumulate

ARM_DOT_K0XN0(K0, a0.v, b, c0);

1545

#if M0 > 1

1546

ARM_DOT_K0XN0(K0, a1.v, b, c1);

1547

#endif // M0 > 1

1548

#if M0 > 2

1549

ARM_DOT_K0XN0(K0, a2.v, b, c2);

1550

#endif // M0 > 2

1551

#if M0 > 3

1552

ARM_DOT_K0XN0(K0, a3.v, b, c3);

1553

#endif // M0 > 3

1554

#if M0 > 4

1555

ARM_DOT_K0XN0(K0, a4.v, b, c4);

1556

#endif // M0 > 4

1557

#if M0 > 5

1558

ARM_DOT_K0XN0(K0, a5.v, b, c5);

1559

#endif // M0 > 5

1560

#if M0 > 6

1561

ARM_DOT_K0XN0(K0, a6.v, b, c6);

1562

#endif // M0 > 6

1563

#if M0 > 7

1564

ARM_DOT_K0XN0(K0, a7.v, b, c7);

1565

#endif // M0 > 7

1566

1567

#endif // LEFTOVER_K != 0

1568

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1569

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1570

1571

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1572

1573

#if defined(REINTERPRET_OUTPUT_AS_3D)

1574

1575

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2020-10-22 16:37:12 +0100

[diff] [blame]

1576

CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1577

1578

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1579

// multiply dst_stride_z by DEPTH_GEMM3D

1580

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1581

1582

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1583

1584

// Add offset for batched GEMM

1585

dst_addr += z * dst_stride_z;

1586

1587

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1588

1589

// Multiply by the weight of matrix-matrix product and store the result

1590

#if defined(ALPHA)

1591

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

1592

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

1597

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1598

1599

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1600

1601

#ifndef UNIT_BETA

1602

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1603

#endif // UNIT_BIAS

1604

1605

// c = c + bias[broadcasted]

1606

ADD_BLOCK_BROADCAST(M0, c, bias0);

1607

1608

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1609

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1610

1611

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1612

1613

#ifndef UNIT_BETA

1614

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

1619

1620

#endif // defined(BROADCAST_BIAS)

1621

#endif // defined(BETA)

1622

1623

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1624

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1625

#endif // defined(ACTIVATION_TYPE)

1626

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

1627

const bool cond_y = y == 0;

1628

const bool cond_x = ((x + 1) * N0 >= N);

1629

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1630

// Store output block

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

1631

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1632

1633

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef LEFTOVER_K

#undef PIXEL_UNIT

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

1640

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1641

/** Fused multiply-add with in-place accumulation: acc = fma(x, y, acc).
 *
 * @note @p acc is both read and assigned, so it must be an assignable expression.
 */
#define VFMA(x, y, acc)       \
    ({                        \
        acc = fma(x, y, acc); \
    })

/** VFMA_M0xN0(k, lhs, rhs, acc)
 *
 * For every row r in [0, M0) issues
 *     VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))(lhs<r>.s<k>), rhs, acc<r>)
 * i.e. component k of the variable lhs<r> is converted to an N0-wide vector, multiplied by rhs
 * and accumulated into the variable acc<r>.
 *
 * @note k, lhs and acc are only used through token pasting (lhs##r, acc##r, .s##k), so they must be
 *       passed as plain tokens (variable basename / component suffix), not as expressions.
 * @note One definition is provided per supported M0 (1 to 8); any other value is a compile-time error.
 */
#if M0 == 1
#define VFMA_M0xN0(k, lhs, rhs, acc) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##0).s##k), rhs, (acc##0)); \
    })
#elif M0 == 2 // M0 == 2
#define VFMA_M0xN0(k, lhs, rhs, acc) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##0).s##k), rhs, (acc##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##1).s##k), rhs, (acc##1)); \
    })
#elif M0 == 3 // M0 == 3
#define VFMA_M0xN0(k, lhs, rhs, acc) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##0).s##k), rhs, (acc##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##1).s##k), rhs, (acc##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##2).s##k), rhs, (acc##2)); \
    })
#elif M0 == 4 // M0 == 4
#define VFMA_M0xN0(k, lhs, rhs, acc) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##0).s##k), rhs, (acc##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##1).s##k), rhs, (acc##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##2).s##k), rhs, (acc##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##3).s##k), rhs, (acc##3)); \
    })
#elif M0 == 5 // M0 == 5
#define VFMA_M0xN0(k, lhs, rhs, acc) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##0).s##k), rhs, (acc##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##1).s##k), rhs, (acc##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##2).s##k), rhs, (acc##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##3).s##k), rhs, (acc##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##4).s##k), rhs, (acc##4)); \
    })
#elif M0 == 6 // M0 == 6
#define VFMA_M0xN0(k, lhs, rhs, acc) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##0).s##k), rhs, (acc##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##1).s##k), rhs, (acc##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##2).s##k), rhs, (acc##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##3).s##k), rhs, (acc##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##4).s##k), rhs, (acc##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##5).s##k), rhs, (acc##5)); \
    })
#elif M0 == 7 // M0 == 7
#define VFMA_M0xN0(k, lhs, rhs, acc) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##0).s##k), rhs, (acc##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##1).s##k), rhs, (acc##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##2).s##k), rhs, (acc##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##3).s##k), rhs, (acc##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##4).s##k), rhs, (acc##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##5).s##k), rhs, (acc##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##6).s##k), rhs, (acc##6)); \
    })
#elif M0 == 8 // M0 == 8
#define VFMA_M0xN0(k, lhs, rhs, acc) \
    ({ \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##0).s##k), rhs, (acc##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##1).s##k), rhs, (acc##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##2).s##k), rhs, (acc##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##3).s##k), rhs, (acc##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##4).s##k), rhs, (acc##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##5).s##k), rhs, (acc##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##6).s##k), rhs, (acc##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((lhs##7).s##k), rhs, (acc##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

1717

1718

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1719

* The LHS matrix is NOT reshaped

1720

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

1721

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1722

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1723

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

1724

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1725

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1726

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1727

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1728

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1729

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1730

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1731

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1732

* - N0 = 2, 3, 4, 8, 16

1733

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1734

* - H0 >= 1

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1735

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1736

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1737

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1738

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1739

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1740

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1741

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1742

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1743

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1744

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1745

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1746

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1747

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1748

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1749

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1750

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1751

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1752

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1753

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1754

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1755

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1756

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1757

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1758

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1759

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1760

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1761

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1762

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1763

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1764

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1765

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1766

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1767

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1768

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1769

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1770

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1771

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1772

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1773

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1774

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1775

*/

1776

__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),

1777

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1778

#if defined(BETA)

1779

IMAGE_DECLARATION(bias),

1780

#endif // defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1781

IMAGE_DECLARATION(dst),

1782

uint lhs_stride_z,

1783

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1784

#if defined(BETA)

1785

uint bias_stride_z,

1786

#endif //defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1787

uint dst_stride_z

1788

#if defined(REINTERPRET_INPUT_AS_3D)

1789

,

1790

uint lhs_cross_plane_pad

1791

#endif // REINTERPRET_INPUT_AS_3D

1792

#if defined(REINTERPRET_OUTPUT_AS_3D)

1793

,

1794

uint dst_cross_plane_pad

1795

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

1800

1801

// RHS offset and step X

1802

#if defined(RHS_INTERLEAVE)

1803

#define RHS_OFFSET_X (N0)

1804

#define RHS_STEP_X ((N0) * (H0))

1805

#define RHS_STEP_LOOP (1)

1806

#else // defined(RHS_INTERLEAVE)

1807

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1808

#define RHS_STEP_X (N0)

1809

#define RHS_STEP_LOOP (H0)

1810

#endif // defined(RHS_INTERLEAVE)

1811

1812

uint x = get_global_id(0);

1813

uint y = get_global_id(1);

1814

uint z = get_global_id(2);

1815

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1816

#if defined(DUMMY_WORK_ITEMS)

1817

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1822

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1823

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1824

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1825

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1826

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1827

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1828

1829

#if defined(MATRIX_B_DEPTH)

1830

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1831

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1832

#else // defined(MATRIX_B_DEPTH)

1833

rhs_offset += z * rhs_stride_z;

1834

#endif // defined(MATRIX_B_DEPTH)

1835

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1836

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;

1837

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1838

1839

#if defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1840

1841

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2020-10-22 16:37:12 +0100

[diff] [blame]

1842

CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1843

1844

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1845

// multiply lhs_stride_z by DEPTH_GEMM3D

1846

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1847

1848

#else // defined(REINTERPRET_INPUT_AS_3D)

1849

1850

// Add offset for batched GEMM

1851

lhs_offset += z * lhs_stride_z;

1852

1853

#endif // defined(REINTERPRET_INPUT_AS_3D)

1854

1855

// Initialize the accumulators

1856

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;

1857

1858

int i = 0;

1859

for(; i <= (K - K0); i += K0)

1860

{

1861

// Supported cases (M0, K0):

1862

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1863

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1864

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1865

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1866

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1867

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1868

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1869

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1870

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1871

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1872

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1873

VEC_DATA_TYPE(DATA_TYPE, N0)

1874

b0;

1875

1876

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1877

VFMA_M0xN0(0, a, b0, c);

1878

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));

1879

VFMA_M0xN0(1, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1880

#if K0 > 2

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1881

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));

1882

VFMA_M0xN0(2, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1883

#endif // K0 > 2

1884

#if K0 > 3

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1885

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));

1886

VFMA_M0xN0(3, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1887

#endif // K0 > 3

1888

#if K0 > 4

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1889

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));

1890

VFMA_M0xN0(4, a, b0, c);

1891

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));

1892

VFMA_M0xN0(5, a, b0, c);

1893

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));

1894

VFMA_M0xN0(6, a, b0, c);

1895

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));

1896

VFMA_M0xN0(7, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1897

#endif // K0 > 4

1898

#if K0 > 8

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1899

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));

1900

VFMA_M0xN0(8, a, b0, c);

1901

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));

1902

VFMA_M0xN0(9, a, b0, c);

1903

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));

1904

VFMA_M0xN0(A, a, b0, c);

1905

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));

1906

VFMA_M0xN0(B, a, b0, c);

1907

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));

1908

VFMA_M0xN0(C, a, b0, c);

1909

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));

1910

VFMA_M0xN0(D, a, b0, c);

1911

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));

1912

VFMA_M0xN0(E, a, b0, c);

1913

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));

1914

VFMA_M0xN0(F, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1915

#endif // K0 > 8

1916

1917

lhs_offset += K0 * sizeof(DATA_TYPE);

1918

rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);

1919

}

1920

1921

// Left-over accumulations

1922

for(; i < K; ++i)

1923

{

1924

// Load values from LHS matrix

1925

VEC_DATA_TYPE(DATA_TYPE, 2)

1926

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

1927

#if M0 > 1

1928

VEC_DATA_TYPE(DATA_TYPE, 2)

1929

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

1930

#endif // M0 > 1

1931

#if M0 > 2

1932

VEC_DATA_TYPE(DATA_TYPE, 2)

1933

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

1934

#endif // M0 > 2

1935

#if M0 > 3

1936

VEC_DATA_TYPE(DATA_TYPE, 2)

1937

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

1938

#endif // M0 > 3

1939

#if M0 > 4

1940

VEC_DATA_TYPE(DATA_TYPE, 2)

1941

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

1942

#endif // M0 > 4

1943

#if M0 > 5

1944

VEC_DATA_TYPE(DATA_TYPE, 2)

1945

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

1946

#endif // M0 > 5

1947

#if M0 > 6

1948

VEC_DATA_TYPE(DATA_TYPE, 2)

1949

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

1950

#endif // M0 > 6

1951

#if M0 > 7

1952

VEC_DATA_TYPE(DATA_TYPE, 2)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

1953

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1954

#endif // M0 > 7

1955

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1956

VEC_DATA_TYPE(DATA_TYPE, N0)

1957

b0;

1958

1959

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1960

VFMA_M0xN0(0, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1961

1962

lhs_offset += sizeof(DATA_TYPE);

1963

rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);

1964

}

1965

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1966

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1967

1968

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1969

1970

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1971

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2020-10-22 16:37:12 +0100

[diff] [blame]

1972

CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1973

1974

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1975

// multiply dst_stride_z by DEPTH_GEMM3D

1976

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1977

1978

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1979

1980

// Add offset for batched GEMM

1981

dst_addr += z * dst_stride_z;

1982

1983

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1984

1985

// Multiply by the weight of matrix-matrix product and store the result

1986

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1987

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1988

#endif // defined(ALPHA)

1989

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1990

// Add beta*bias

1991

#if defined(BETA)

1992

#if defined(BROADCAST_BIAS)

1993

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1994

1995

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1996

1997

#ifndef UNIT_BETA

1998

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1999

#endif // UNIT_BIAS

2000

2001

// c = c + bias[broadcasted]

2002

ADD_BLOCK_BROADCAST(M0, c, bias0);

2003

2004

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2005

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

2006

2007

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2008

2009

#ifndef UNIT_BETA

2010

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

2015

2016

#endif // defined(BROADCAST_BIAS)

2017

#endif // defined(BETA)

2018

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2019

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

2020

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2021

#endif // defined(ACTIVATION_TYPE)

2022

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2023

const bool cond_y = y == 0;

2024

const bool cond_x = ((x + 1) * N0 >= N);

2025

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2026

// Store output block

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

2027

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2028

2029

#undef RHS_BLOCK_SIZE

2030

#undef RHS_OFFSET_X

2031

#undef RHS_STEP_X

2032

}

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2033

2034

#if defined(OPENCL_IMAGE_SUPPORT)

2035

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2036

* The LHS matrix is NOT reshaped

2037

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

2038

*

2039

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2040

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2041

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

2042

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2043

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2044

* could be different from the value returned by get_image_height(rhs_img).

2045

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

2046

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

2047

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2048

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

2049

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

2050

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2051

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2052

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

2058

* The activation function is performed after the bias addition

2059

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2060

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

2061

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2062

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2063

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2064

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

2065

*

2066

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32

2067

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

2068

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2069

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

2070

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2071

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

2072

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2073

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2074

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2075

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2076

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2077

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2078

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2079

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2080

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2081

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2082

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2083

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2084

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2085

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

2086

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2087

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2088

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2089

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

2090

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2091

*/

2092

__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),

2093

__read_only image2d_t rhs_img,

2094

#if defined(BETA)

2095

IMAGE_DECLARATION(bias),

2096

#endif // defined(BETA)

2097

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2103

uint dst_stride_z

2104

#if defined(REINTERPRET_INPUT_AS_3D)

2105

,

2106

uint lhs_cross_plane_pad

2107

#endif // REINTERPRET_INPUT_AS_3D

2108

#if defined(REINTERPRET_OUTPUT_AS_3D)

2109

,

2110

uint dst_cross_plane_pad

2111

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

2116

2117

// Block size

2118

#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

2119

2120

// RHS offset and step X

2121

#if defined(RHS_INTERLEAVE)

2122

#define RHS_OFFSET_X (PIXEL_UNIT)

2123

#define RHS_STEP_X ((PIXEL_UNIT) * (H0))

2124

#else // defined(RHS_INTERLEAVE)

2125

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2126

#define RHS_STEP_X (PIXEL_UNIT)

2127

#endif // defined(RHS_INTERLEAVE)

2128

2129

uint x = get_global_id(0);

2130

uint y = get_global_id(1);

2131

uint z = get_global_id(2);

2132

2133

#if defined(DUMMY_WORK_ITEMS)

2134

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2139

2140

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2141

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2142

2143

#if defined(MATRIX_B_DEPTH)

2144

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2145

const uint z_rhs = (z % MATRIX_B_DEPTH);

2146

#else // defined(MATRIX_B_DEPTH)

2147

const uint z_rhs = z;

2148

#endif // defined(MATRIX_B_DEPTH)

2149

2150

// Compute RHS matrix coordinates

2151

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

2152

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

2153

2154

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);

2155

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2156

2157

#if defined(REINTERPRET_INPUT_AS_3D)

2158

2159

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2020-10-22 16:37:12 +0100

[diff] [blame]

2160

CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2161

2162

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2163

// multiply lhs_stride_z by DEPTH_GEMM3D

2164

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

2165

2166

#else // defined(REINTERPRET_INPUT_AS_3D)

2167

2168

// Add offset for batched GEMM

2169

lhs_offset += z * lhs_stride_z;

2170

2171

#endif // defined(REINTERPRET_INPUT_AS_3D)

2172

2173

// Initialize the accumulators

2174

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

2175

2176

int i = 0;

2177

for(; i <= (K - K0); i += K0)

2178

{

2179

// Load values from LHS matrix

2180

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

2181

2182

VEC_DATA_TYPE(DATA_TYPE, N0)

2183

b0;

2184

2185

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2186

VFMA_M0xN0(0, a, b0, c);

2187

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

2188

VFMA_M0xN0(1, a, b0, c);

2189

#if K0 > 2

2190

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

2191

VFMA_M0xN0(2, a, b0, c);

2192

#endif // K0 > 2

2193

#if K0 > 3

2194

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

2195

VFMA_M0xN0(3, a, b0, c);

2196

#endif // K0 > 3

2197

#if K0 > 4

2198

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

2199

VFMA_M0xN0(4, a, b0, c);

2200

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

2201

VFMA_M0xN0(5, a, b0, c);

2202

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

2203

VFMA_M0xN0(6, a, b0, c);

2204

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

2205

VFMA_M0xN0(7, a, b0, c);

2206

#endif // K0 > 4

2207

#if K0 > 8

2208

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

2209

VFMA_M0xN0(8, a, b0, c);

2210

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

2211

VFMA_M0xN0(9, a, b0, c);

2212

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

2213

VFMA_M0xN0(A, a, b0, c);

2214

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

2215

VFMA_M0xN0(B, a, b0, c);

2216

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

2217

VFMA_M0xN0(C, a, b0, c);

2218

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

2219

VFMA_M0xN0(D, a, b0, c);

2220

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

2221

VFMA_M0xN0(E, a, b0, c);

2222

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

2223

VFMA_M0xN0(F, a, b0, c);

2224

#endif // K0 > 8

2225

2226

lhs_offset += K0 * sizeof(DATA_TYPE);

2227

x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;

2228

}

2229

2230

// Left-over accumulations

2231

for(; i < K; ++i)

2232

{

2233

// Load values from LHS matrix

2234

VEC_DATA_TYPE(DATA_TYPE, 2)

2235

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

2236

#if M0 > 1

2237

VEC_DATA_TYPE(DATA_TYPE, 2)

2238

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

2239

#endif // M0 > 1

2240

#if M0 > 2

2241

VEC_DATA_TYPE(DATA_TYPE, 2)

2242

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

2243

#endif // M0 > 2

2244

#if M0 > 3

2245

VEC_DATA_TYPE(DATA_TYPE, 2)

2246

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

2247

#endif // M0 > 3

2248

#if M0 > 4

2249

VEC_DATA_TYPE(DATA_TYPE, 2)

2250

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

2251

#endif // M0 > 4

2252

#if M0 > 5

2253

VEC_DATA_TYPE(DATA_TYPE, 2)

2254

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

2255

#endif // M0 > 5

2256

#if M0 > 6

2257

VEC_DATA_TYPE(DATA_TYPE, 2)

2258

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

2259

#endif // M0 > 6

2260

#if M0 > 7

2261

VEC_DATA_TYPE(DATA_TYPE, 2)

2262

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

2263

#endif // M0 > 7

2264

2265

VEC_DATA_TYPE(DATA_TYPE, N0)

2266

b0;

2267

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2268

2269

VFMA_M0xN0(0, a, b0, c);

2270

2271

lhs_offset += sizeof(DATA_TYPE);

x_rhs += RHS_STEP_X;

}

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2275

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2276

2277

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

2278

2279

#if defined(REINTERPRET_OUTPUT_AS_3D)

2280

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2020-10-22 16:37:12 +0100

[diff] [blame]

2281

CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2282

2283

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2284

// multiply dst_stride_z by DEPTH_GEMM3D

2285

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2286

2287

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2288

2289

// Add offset for batched GEMM

2290

dst_addr += z * dst_stride_z;

2291

2292

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2293

2294

// Multiply by the weight of matrix-matrix product and store the result

2295

#if defined(ALPHA)

2296

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2297

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2302

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2303

2304

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2305

2306

#ifndef UNIT_BETA

2307

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2308

#endif // UNIT_BIAS

2309

2310

// c = c + bias[broadcasted]

2311

ADD_BLOCK_BROADCAST(M0, c, bias0);

2312

2313

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2314

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2315

2316

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2317

2318

#ifndef UNIT_BETA

2319

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

2324

2325

#endif // defined(BROADCAST_BIAS)

2326

#endif // defined(BETA)

2327

2328

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

2329

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2330

#endif // defined(ACTIVATION_TYPE)

2331

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2332

const bool cond_y = y == 0;

2333

const bool cond_x = ((x + 1) * N0 >= N);

2334

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2335

// Store output block

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

2336

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2337

2338

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2343

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2344

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2345

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2346

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2347

#if defined(MIXED_PRECISION)

2348

#if K0 == 2

2349

// MIXED_PRECISION, K0 == 2: adds a.s0*b.s0 + a.s1*b.s1 to c, one component at a
// time, using a separate multiply and add (no fma). c must be a modifiable lvalue.
#define ARM_DOT_K0(a, b, c) \
    ({                      \
        c += a.s0 * b.s0;   \
        c += a.s1 * b.s1;   \
    })

#elif K0 == 3 // K0 == 3

2355

// MIXED_PRECISION, K0 == 3: adds the products of components s0..s2 of a and b to c,
// one component at a time, using a separate multiply and add (no fma).
// c must be a modifiable lvalue.
#define ARM_DOT_K0(a, b, c) \
    ({                      \
        c += a.s0 * b.s0;   \
        c += a.s1 * b.s1;   \
        c += a.s2 * b.s2;   \
    })

#elif K0 == 4 // K0 == 4

2362

// MIXED_PRECISION, K0 == 4: adds the products of components s0..s3 of a and b to c,
// one component at a time, using a separate multiply and add (no fma).
// c must be a modifiable lvalue.
#define ARM_DOT_K0(a, b, c) \
    ({                      \
        c += a.s0 * b.s0;   \
        c += a.s1 * b.s1;   \
        c += a.s2 * b.s2;   \
        c += a.s3 * b.s3;   \
    })

#elif K0 == 8 // K0 == 8

2370

// MIXED_PRECISION, K0 == 8: adds the products of components s0..s7 of a and b to c,
// one component at a time, using a separate multiply and add (no fma).
// c must be a modifiable lvalue.
#define ARM_DOT_K0(a, b, c) \
    ({                      \
        c += a.s0 * b.s0;   \
        c += a.s1 * b.s1;   \
        c += a.s2 * b.s2;   \
        c += a.s3 * b.s3;   \
        c += a.s4 * b.s4;   \
        c += a.s5 * b.s5;   \
        c += a.s6 * b.s6;   \
        c += a.s7 * b.s7;   \
    })

#elif K0 == 16 // K0 == 16

2382

// MIXED_PRECISION, K0 == 16: adds the products of components s0..sF of a and b to c,
// one component at a time, using a separate multiply and add (no fma).
// c must be a modifiable lvalue.
#define ARM_DOT_K0(a, b, c) \
    ({                      \
        c += a.s0 * b.s0;   \
        c += a.s1 * b.s1;   \
        c += a.s2 * b.s2;   \
        c += a.s3 * b.s3;   \
        c += a.s4 * b.s4;   \
        c += a.s5 * b.s5;   \
        c += a.s6 * b.s6;   \
        c += a.s7 * b.s7;   \
        c += a.s8 * b.s8;   \
        c += a.s9 * b.s9;   \
        c += a.sA * b.sA;   \
        c += a.sB * b.sB;   \
        c += a.sC * b.sC;   \
        c += a.sD * b.sD;   \
        c += a.sE * b.sE;   \
        c += a.sF * b.sF;   \
    })

#else // K0 not supported

2402

#error "K0 value not supported"

2403

#endif // K0 conditions

2404

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2405

// ARM_DOT_K0(a, b, c) - variant selected when MIXED_PRECISION is NOT defined.
// Accumulates into c the products of the matching components of a and b, one fma() call
// per component:
//   c = fma(a.sN, b.sN, c)   for N = 0 .. K0-1
// One definition is provided for each supported K0 (2, 3, 4, 8, 16); any other K0 stops
// the build with #error.
#if K0 == 2

2406

#define ARM_DOT_K0(a, b, c) \

2407

({ \

2408

c = fma(a.s0, b.s0, c); \

2409

c = fma(a.s1, b.s1, c); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2410

})

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2411

#elif K0 == 3 // K0 == 3

2412

#define ARM_DOT_K0(a, b, c) \

2413

({ \

2414

c = fma(a.s0, b.s0, c); \

2415

c = fma(a.s1, b.s1, c); \

2416

c = fma(a.s2, b.s2, c); \

2417

})

2418

#elif K0 == 4 // K0 == 4

2419

#define ARM_DOT_K0(a, b, c) \

2420

({ \

2421

c = fma(a.s0, b.s0, c); \

2422

c = fma(a.s1, b.s1, c); \

2423

c = fma(a.s2, b.s2, c); \

2424

c = fma(a.s3, b.s3, c); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2425

})

2426

#elif K0 == 8 // K0 == 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2427

#define ARM_DOT_K0(a, b, c) \

2428

({ \

2429

c = fma(a.s0, b.s0, c); \

2430

c = fma(a.s1, b.s1, c); \

2431

c = fma(a.s2, b.s2, c); \

2432

c = fma(a.s3, b.s3, c); \

2433

c = fma(a.s4, b.s4, c); \

2434

c = fma(a.s5, b.s5, c); \

2435

c = fma(a.s6, b.s6, c); \

2436

c = fma(a.s7, b.s7, c); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2437

})

2438

#elif K0 == 16 // K0 == 16

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2439

#define ARM_DOT_K0(a, b, c) \

2440

({ \

2441

c = fma(a.s0, b.s0, c); \

2442

c = fma(a.s1, b.s1, c); \

2443

c = fma(a.s2, b.s2, c); \

2444

c = fma(a.s3, b.s3, c); \

2445

c = fma(a.s4, b.s4, c); \

2446

c = fma(a.s5, b.s5, c); \

2447

c = fma(a.s6, b.s6, c); \

2448

c = fma(a.s7, b.s7, c); \

2449

c = fma(a.s8, b.s8, c); \

2450

c = fma(a.s9, b.s9, c); \

2451

c = fma(a.sA, b.sA, c); \

2452

c = fma(a.sB, b.sB, c); \

2453

c = fma(a.sC, b.sC, c); \

2454

c = fma(a.sD, b.sD, c); \

2455

c = fma(a.sE, b.sE, c); \

2456

c = fma(a.sF, b.sF, c); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2457

})

2458

#else // K0 not supported

2459

#error "K0 value not supported"

2460

#endif // K0 conditions

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2461

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2462

2463

// ARM_DOT_K0XN0(a, b, c): expands to N0 invocations of ARM_DOT_K0, one per component of c.
// For each index i in 0 .. N0-1 (hex digits 0-9, A-F) it pairs:
//   - a      : the same first operand for every invocation
//   - b##i   : the identifier formed by pasting i onto the b argument (b0, b1, ...)
//   - c.s<i> : component i of the accumulator c
// One definition is provided for each supported N0 (2, 3, 4, 8, 16); any other N0 stops
// the build with #error.
#if N0 == 2

2464

#define ARM_DOT_K0XN0(a, b, c) \

2465

({ \

2466

ARM_DOT_K0((a), (b##0), (c.s0)); \

2467

ARM_DOT_K0((a), (b##1), (c.s1)); \

2468

})

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2469

#elif N0 == 3 // N0 == 3

2470

#define ARM_DOT_K0XN0(a, b, c) \

2471

({ \

2472

ARM_DOT_K0((a), (b##0), (c.s0)); \

2473

ARM_DOT_K0((a), (b##1), (c.s1)); \

2474

ARM_DOT_K0((a), (b##2), (c.s2)); \

2475

})

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2476

#elif N0 == 4 // N0 == 4

2477

#define ARM_DOT_K0XN0(a, b, c) \

2478

({ \

2479

ARM_DOT_K0((a), (b##0), (c.s0)); \

2480

ARM_DOT_K0((a), (b##1), (c.s1)); \

2481

ARM_DOT_K0((a), (b##2), (c.s2)); \

2482

ARM_DOT_K0((a), (b##3), (c.s3)); \

2483

})

2484

#elif N0 == 8 // N0 == 8

2485

#define ARM_DOT_K0XN0(a, b, c) \

2486

({ \

2487

ARM_DOT_K0((a), (b##0), (c.s0)); \

2488

ARM_DOT_K0((a), (b##1), (c.s1)); \

2489

ARM_DOT_K0((a), (b##2), (c.s2)); \

2490

ARM_DOT_K0((a), (b##3), (c.s3)); \

2491

ARM_DOT_K0((a), (b##4), (c.s4)); \

2492

ARM_DOT_K0((a), (b##5), (c.s5)); \

2493

ARM_DOT_K0((a), (b##6), (c.s6)); \

2494

ARM_DOT_K0((a), (b##7), (c.s7)); \

2495

})

2496

#elif N0 == 16 // N0 == 16

2497

#define ARM_DOT_K0XN0(a, b, c) \

2498

({ \

2499

ARM_DOT_K0((a), (b##0), (c.s0)); \

2500

ARM_DOT_K0((a), (b##1), (c.s1)); \

2501

ARM_DOT_K0((a), (b##2), (c.s2)); \

2502

ARM_DOT_K0((a), (b##3), (c.s3)); \

2503

ARM_DOT_K0((a), (b##4), (c.s4)); \

2504

ARM_DOT_K0((a), (b##5), (c.s5)); \

2505

ARM_DOT_K0((a), (b##6), (c.s6)); \

2506

ARM_DOT_K0((a), (b##7), (c.s7)); \

2507

ARM_DOT_K0((a), (b##8), (c.s8)); \

2508

ARM_DOT_K0((a), (b##9), (c.s9)); \

2509

ARM_DOT_K0((a), (b##A), (c.sA)); \

2510

ARM_DOT_K0((a), (b##B), (c.sB)); \

2511

ARM_DOT_K0((a), (b##C), (c.sC)); \

2512

ARM_DOT_K0((a), (b##D), (c.sD)); \

2513

ARM_DOT_K0((a), (b##E), (c.sE)); \

2514

ARM_DOT_K0((a), (b##F), (c.sF)); \

2515

})

2516

#else // N0 not supported

2517

#error "N0 value not supported"

2518

#endif // N0 conditions

2519

2520

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2521

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2522

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2523

*

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2524

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2525

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2526

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2527

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2528

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2529

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2530

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2531

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2532

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2533

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2534

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

2535

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2536

* @note Only the following configurations of M0, N0 and K0 are currently supported:

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2537

* - M0 = 2, 3, 4, 5, 6, 7, 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2538

* - N0 = 2, 3, 4, 8, 16

2539

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

2540

* - V0 >= 1

2541

* - H0 >= 1

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2542

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2543

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2544

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2545

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2546

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2547

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2548

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2549

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2550

*

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2551

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

2552

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2553

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2554

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2555

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2556

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2557

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

2558

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

2559

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

2560

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

2561

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

2562

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

2563

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2564

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2565

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2566

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2567

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2568

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2569

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

2570

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2571

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2572

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2573

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2574

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2575

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2576

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2577

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2578

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2579

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2580

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2581

*/

2582

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

2583

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2584

#if defined(BETA)

2585

IMAGE_DECLARATION(bias),

2586

#endif // defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2587

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2588

uint k,

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2589

uint lhs_stride_z,

2590

uint rhs_stride_z,

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2591

#if defined(BETA)

2592

uint bias_stride_z,

2593

#endif //defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2594

uint dst_stride_z

2595

#if defined(REINTERPRET_OUTPUT_AS_3D)

2596

,

2597

uint dst_cross_plane_pad

2598

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Overview: the work-item (get_global_id(0), get_global_id(1), get_global_id(2)) accumulates
// the M0 accumulator rows c0..c(M0-1), each N0 wide, over the k dimension in steps of K0.
// It then applies the optional ALPHA scale, BETA bias and activation, and writes the block
// with STORE_BLOCK_BOUNDARY_AWARE.
// The LHS_*/RHS_* helper macros defined below are local to this kernel and are #undef'd
// before the closing brace.

// LHS block size (elements in one M0xK0 block)

#define LHS_BLOCK_SIZE ((K0) * (M0))

2603

2604

// LHS offset and step X
#if defined(LHS_INTERLEAVE)

2605

#define LHS_OFFSET_X (K0)

2606

#define LHS_STEP_X ((K0) * (V0))

2607

#define LHS_STEP_LOOP (1)

2608

#else // defined(LHS_INTERLEAVE)

2609

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2610

#define LHS_STEP_X (K0)

2611

#define LHS_STEP_LOOP (V0)

2612

#endif // defined(LHS_INTERLEAVE)

2613

2614

// RHS block size (elements in one K0xN0 block)

2615

#define RHS_BLOCK_SIZE ((K0) * (N0))

2616

2617

// RHS offset and step X

2618

#if defined(RHS_INTERLEAVE)

2619

#define RHS_OFFSET_X (K0)

2620

#define RHS_STEP_X ((K0) * (H0))

2621

#define RHS_STEP_LOOP (1)

2622

#else // defined(RHS_INTERLEAVE)

2623

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2624

#define RHS_STEP_X (K0)

2625

#define RHS_STEP_LOOP (H0)

2626

#endif // defined(RHS_INTERLEAVE)

2627

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2628

// Work-items whose block starts at or beyond column N or row M return without doing any work
#if defined(DUMMY_WORK_ITEMS)

2629

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2634

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2635

// Compute LHS matrix address: (gid(1) % V0) selects the block inside the reshaped row,
// (gid(1) / V0) selects the reshaped row, gid(2) selects the batch

2636

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2637

(get_global_id(2) * lhs_stride_z);

2638

2639

// Compute RHS matrix address: (gid(0) % H0) selects the block inside the reshaped row,
// (gid(0) / H0) selects the reshaped row

2640

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

2641

2642

#if defined(MATRIX_B_DEPTH)

2643

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2644

rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;

2645

#else // defined(MATRIX_B_DEPTH)

2646

rhs_addr += get_global_id(2) * rhs_stride_z;

2647

#endif // defined(MATRIX_B_DEPTH)

2648

2649

// Initialize the accumulators (M0 vectors of N0 elements of type DATA_TYPE_ACCUMULATOR)

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2650

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2651

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2652

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2653

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2654

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2655

// NOTE(review): i is a signed int while the kernel argument k is uint, so the comparison
// below is a mixed signed/unsigned comparison - confirm k always fits in an int.
for(int i = 0; i < k; i += K0)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2656

{

2657

// Supported cases (M0, K0):

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2658

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

2659

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

2660

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

2661

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

2662

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

2663

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

2664

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

2665

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2666

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2667

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2668

2669

// Load values from RHS matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2670

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2671

2672

// Accumulate: one ARM_DOT_K0XN0 per LHS row a<i> into accumulator row c<i>, for i < M0

2673

ARM_DOT_K0XN0(a0, b, c0);

2674

#if M0 > 1

2675

ARM_DOT_K0XN0(a1, b, c1);

2676

#endif // M0 > 1

2677

#if M0 > 2

2678

ARM_DOT_K0XN0(a2, b, c2);

2679

#endif // M0 > 2

2680

#if M0 > 3

2681

ARM_DOT_K0XN0(a3, b, c3);

2682

#endif // M0 > 3

2683

#if M0 > 4

2684

ARM_DOT_K0XN0(a4, b, c4);

2685

#endif // M0 > 4

2686

#if M0 > 5

2687

ARM_DOT_K0XN0(a5, b, c5);

2688

#endif // M0 > 5

2689

#if M0 > 6

2690

ARM_DOT_K0XN0(a6, b, c6);

2691

#endif // M0 > 6

2692

#if M0 > 7

2693

ARM_DOT_K0XN0(a7, b, c7);

2694

#endif // M0 > 7

2695

2696

// Advance both input pointers for the next iteration of the loop

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2697

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

2698

}

2699

2700

// Compute the destination address of the block: column gid(0) * N0, row gid(1) * M0

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2701

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2702

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2703

2704

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2705

2706

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Michele Di Giorgio

2020-11-23 15:05:12 +0000

[diff] [blame]

2707

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2708

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2709

// multiply dst_stride_z by DEPTH_GEMM3D

2710

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2711

2712

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2713

2714

// Add offset for batched GEMM

2715

dst_addr += get_global_id(2) * dst_stride_z;

2716

2717

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2718

2719

// Multiply by the weight of matrix-matrix product (ALPHA), when defined

2720

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2721

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2722

#endif // defined(ALPHA)

2723

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2724

// Add beta*bias

2725

#if defined(BETA)

2726

#if defined(BROADCAST_BIAS)

2727

// Broadcast case: a single bias row of N0 elements is loaded and added to every row of c

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2728

2729

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2730

2731

#ifndef UNIT_BETA

2732

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2733

#endif // UNIT_BETA

2734

2735

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2736

#if defined(MIXED_PRECISION)

2737

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2738

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2739

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2740

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2741

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2742

2743

#else // defined(BROADCAST_BIAS)

2744

// Non-broadcast case: an M0xN0 bias block is loaded from the same (x, y, z) position as the output block

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2745

2) * bias_stride_z;

2746

2747

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2748

2749

#ifndef UNIT_BETA

2750

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

2751

#endif // UNIT_BETA

2752

2753

// c = c + bias

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2754

#if defined(MIXED_PRECISION)

2755

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2756

ADD_BLOCK(M0, c, bias_hp);

2757

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2758

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2759

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2760

2761

#endif // defined(BROADCAST_BIAS)

2762

#endif // defined(BETA)

2763

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2764

// Apply the activation on the accumulators (after the bias addition)
// NOTE(review): VEC_SIZE is not one of the compile-time options documented for this kernel,
// while c is declared with N0 elements per row - confirm VEC_SIZE is provided by the build options.
#if defined(ACTIVATION_TYPE)

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2765

#if defined(MIXED_PRECISION)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

2766

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2767

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

2768

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2769

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2770

#endif // defined(ACTIVATION_TYPE)

2771

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2772

// cond_y / cond_x are true when this block reaches or passes the last row (M) / column (N);
// they are forwarded to STORE_BLOCK_BOUNDARY_AWARE together with PARTIAL_STORE_M0 / PARTIAL_STORE_N0

const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);

2773

const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

2774

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2775

// Store output block (in mixed precision the accumulators are first converted to DATA_TYPE)

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2776

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2777

CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

2778

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2779

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

2780

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2781

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2782

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2783

// Remove the kernel-local helper macros so that later kernels can redefine them

#undef LHS_BLOCK_SIZE

2784

#undef LHS_OFFSET_X

2785

#undef LHS_STEP_X

2786

#undef RHS_BLOCK_SIZE

2787

#undef RHS_OFFSET_X

2788

#undef RHS_STEP_X

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2789

#undef LHS_STEP_LOOP

2790

#undef RHS_STEP_LOOP

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2791

}

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2792

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2793

#if defined(OPENCL_IMAGE_SUPPORT)

2794

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

2795

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2796

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2797

*

2798

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2799

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2800

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2801

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

2802

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2803

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2804

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2805

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2806

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2807

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2808

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2809

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2810

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2811

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2812

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

2813

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2814

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2815

* - M0 = 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

2822

* The activation function is performed after the bias addition

2823

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

2824

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2825

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2826

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2827

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2828

*

2829

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

2830

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2831

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2832

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2833

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2834

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2835

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2836

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2837

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2838

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2839

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2840

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2841

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2842

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

2843

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2844

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2845

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2846

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2847

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2848

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2849

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2850

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2851

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2852

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2853

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2854

*/

2855

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),

2856

__read_only image2d_t rhs_img,

2857

#if defined(BETA)

2858

IMAGE_DECLARATION(bias),

2859

#endif // defined(BETA)

2860

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2861

uint k,

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2867

uint dst_stride_z

2868

#if defined(REINTERPRET_OUTPUT_AS_3D)

2869

,

2870

uint dst_cross_plane_pad

2871

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

2876

2877

// Block size

2878

#define LHS_BLOCK_SIZE ((K0) * (M0))

2879

2880

#if defined(LHS_INTERLEAVE)

2881

#define LHS_OFFSET_X (K0)

2882

#define LHS_STEP_X ((K0) * (V0))

2883

#define LHS_STEP_LOOP (1)

2884

#else // defined(INTERLEAVE)

2885

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2886

#define LHS_STEP_X (K0)

2887

#define LHS_STEP_LOOP (V0)

2888

#endif // defined(INTERLEAVE)

2889

2890

// Block size

2891

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

2892

2893

// RHS offset and step X

2894

#if defined(RHS_INTERLEAVE)

2895

#define RHS_OFFSET_X (PIXEL_UNIT)

2896

#define RHS_STEP_X (PIXEL_UNIT * (H0))

2897

#define RHS_STEP_LOOP (1)

2898

#else // defined(RHS_INTERLEAVE)

2899

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2900

#define RHS_STEP_X PIXEL_UNIT

2901

#define RHS_STEP_LOOP (H0)

2902

#endif // defined(RHS_INTERLEAVE)

2903

2904

#if defined(DUMMY_WORK_ITEMS)

2905

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2910

2911

// Compute LHS matrix address

2912

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2913

(get_global_id(2) * lhs_stride_z);

2914

2915

#if defined(MATRIX_B_DEPTH)

2916

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2917

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

2918

#else // defined(MATRIX_B_DEPTH)

2919

const uint z_rhs = get_global_id(2);

2920

#endif // defined(MATRIX_B_DEPTH)

2921

2922

// Compute RHS matrix coordinates

2923

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

2924

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

2925

2926

// Initialize the accumulators

2927

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

2928

2929

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2930

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2931

2932

for(int i = 0; i < K; i += K0)

2933

{

2934

// Load values from LHS matrix

2935

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

2936

2937

// Load values from RHS matrix stored in a cl_image

2938

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

2939

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

2940

2941

// Accumulate

2942

ARM_DOT_K0XN0(a0, b, c0);

2943

#if M0 > 1

2944

ARM_DOT_K0XN0(a1, b, c1);

2945

#endif // M0 > 1

2946

#if M0 > 2

2947

ARM_DOT_K0XN0(a2, b, c2);

2948

#endif // M0 > 2

2949

#if M0 > 3

2950

ARM_DOT_K0XN0(a3, b, c3);

2951

#endif // M0 > 3

2952

#if M0 > 4

2953

ARM_DOT_K0XN0(a4, b, c4);

2954

#endif // M0 > 4

2955

#if M0 > 5

2956

ARM_DOT_K0XN0(a5, b, c5);

2957

#endif // M0 > 5

2958

#if M0 > 6

2959

ARM_DOT_K0XN0(a6, b, c6);

2960

#endif // M0 > 6

2961

#if M0 > 7

2962

ARM_DOT_K0XN0(a7, b, c7);

2963

#endif // M0 > 7

2964

2965

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2966

2967

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

2968

}

2969

2970

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2971

2972

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

2973

2974

#if defined(REINTERPRET_OUTPUT_AS_3D)

2975

2976

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Michele Di Giorgio

2020-11-23 15:05:12 +0000

[diff] [blame]

2977

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2978

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2979

// multiply dst_stride_z by DEPTH_GEMM3D

2980

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2981

2982

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2983

2984

// Add offset for batched GEMM

2985

dst_addr += get_global_id(2) * dst_stride_z;

2986

2987

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2988

2989

// Multiply by the weight of matrix-matrix product and store the result

2990

#if defined(ALPHA)

2991

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2992

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2997

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2998

2999

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3000

3001

#ifndef UNIT_BETA

3002

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

3003

#endif // UNIT_BIAS

3004

3005

// c = c + bias[broadcasted]

3006

#if defined(MIXED_PRECISION)

3007

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3008

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

3009

#else // defined(MIXED_PRECISION)

3010

ADD_BLOCK_BROADCAST(M0, c, bias0);

3011

#endif // defined(MIXED_PRECISION)

3012

3013

#else // defined(BROADCAST_BIAS)

3014

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

3015

2) * bias_stride_z;

3016

3017

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3018

3019

#ifndef UNIT_BETA

3020

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

#if defined(MIXED_PRECISION)

3025

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3026

ADD_BLOCK(M0, c, bias_hp);

3027

#else // defined(MIXED_PRECISION)

3028

ADD_BLOCK(M0, c, bias);

3029

#endif // defined(MIXED_PRECISION)

3030

3031

#endif // defined(BROADCAST_BIAS)

3032

#endif // defined(BETA)

3033

3034

#if defined(ACTIVATION_TYPE)

3035

#if defined(MIXED_PRECISION)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

3036

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3037

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

3038

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3039

#endif // defined(MIXED_PRECISION)

3040

#endif // defined(ACTIVATION_TYPE)

3041

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

3042

const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);

3043

const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

3044

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3045

// Store output block

3046

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

3047

CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

3048

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3049

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

3050

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3051

#endif // defined(MIXED_PRECISION)

3052

3053

#undef LHS_BLOCK_SIZE

3054

#undef LHS_OFFSET_X

3055

#undef LHS_STEP_X

3056

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3064

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3065

#if defined(LHS_TRANSPOSE)

3066

3067

// Shorthand for the vector type made of SIZE elements of TYPE
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

// ARM_VFMA(N0, a, b, c): accumulate the product a * b into c (c must be a modifiable lvalue).
// - With MIXED_PRECISION, a and b are first converted to N0-wide vectors of DATA_TYPE_ACCUMULATOR.
// - When GPU_ARCH == GPU_ARCH_MIDGARD a separate multiply and add is emitted, otherwise fma() is used.
// Note: every variant already ends with ';'.
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#else // defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#endif // defined(MIXED_PRECISION)

3086

3087

// Column-vector (transposed) by row-vector (not transposed) multiplications for a single K step.
// For each of the M0 rows, the scalar taken from a is broadcast to an N0-wide vector of TYPE and
// multiplied-accumulated with b into the accumulator C##<row>.
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)           \
    ({                                                   \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0));   \
    })
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })

// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
// a is the column-vector (transposed)
// b is the row-vector (not transposed)
// C is the output matrix
// Lower case is a vector (a, b)
// Upper case is a matrix (C)
// M0 is token-pasted into the macro name, so it must already be a literal (1, 2, 3, 4 or 8)
// by the time this macro is expanded.
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)

3122

3123

// Matrix (transposed) by matrix (not transposed) multiplications for K0 = 1, 2, 3, 4 and 8.
// Step i pairs the LHS vector A##i with the RHS vector B##i and accumulates into the rows of C.
// A and B must be plain basenames because the step index is token-pasted onto them.
#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
    })
#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
    })
#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
    })
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
    })
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
    })

3150

#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \

3151

({ \

3152

ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \

3153

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \

3154

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \

3155

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \

3156

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \

3157

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \

3158

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \

3159

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \

3160

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \

3161

})

3162

3163

// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
// The dimensions for this matrix multiplications are defined through M0, N0 and K0
// The dimensions supported are:
// M0: 1, 2, 3, 4, 8
// N0: 1, 2, 3, 4, 8, 16
// K0: 1, 2, 3, 4, 8, 16
// This macro calls the vector-by-matrix macro K0 times
// A, B and C are matrices
// K0 selects the implementation by being concatenated onto the ARM_MM_T_NT_M0xN0x prefix.
#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
    (M0, N0, TYPE, A, B, C)

3174

3175

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

3176

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

3177

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

3178

*

3179

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

3180

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3181

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3182

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

3183

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

3184

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

3185

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

3186

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

3187

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

3188

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3189

* @note Only the following configurations of M0, N0 and K0 are currently supported:

3190

* - M0 = 2, 3, 4, 8

3191

* - N0 = 2, 3, 4, 8, 16

3192

* - K0 = 2, 3, 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3197

* The activation function is performed after the bias addition

3198

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

3199

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3200

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3201

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3202

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

3203

*

3204

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

3205

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

3206

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3207

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

3208

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

3209

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

3210

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

3211

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

3212

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3213

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

3214

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

3215

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

3216

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

3217

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3218

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

3219

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3220

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

3221

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

3222

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

3223

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3224

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3225

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3226

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3227

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3228

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3229

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

3230

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

3231

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

3232

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3233

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3234

*/

3235

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
                                            IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                            IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                            IMAGE_DECLARATION(dst),
                                            uint k,
                                            uint lhs_stride_z,
                                            uint rhs_stride_z,
#if defined(BETA)
                                            uint bias_stride_z,
#endif // defined(BETA)
                                            uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                            ,
                                            uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                           )
{
    // LHS block size
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X (in elements)
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (M0)
#define LHS_STEP_X ((M0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (M0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // RHS block size
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X (in elements)
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (N0)
#define RHS_STEP_X ((N0) * (H0))
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (N0)
#endif // defined(RHS_INTERLEAVE)

    // One K step: load M0 values of the transposed LHS block and N0 values of the RHS block,
    // accumulate their outer product into c0..c(M0-1), then advance both pointers by one step.
#define GEMM_LHS_T_RHS_NT_K_STEP()                  \
    ({                                              \
        a0 = VLOAD(M0)(0, lhs);                     \
        b0 = VLOAD(N0)(0, rhs);                     \
        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); \
        lhs += LHS_STEP_X;                          \
        rhs += RHS_STEP_X;                          \
    })

    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    const uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items that fall completely outside the M x N output have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

    // Compute RHS matrix address
    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_addr += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

    // From here on both operands are walked in units of DATA_TYPE elements
    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);

    // The index is unsigned to match the type of the runtime bound k
    for(uint i = 0; i < k; i += K0)
    {
        VEC_DATA_TYPE(DATA_TYPE, M0)
        a0;
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        // Step 0
        GEMM_LHS_T_RHS_NT_K_STEP();

#if K0 > 1
        // Step 1
        GEMM_LHS_T_RHS_NT_K_STEP();
#endif // K0 > 1

#if K0 > 2
        // Step 2
        GEMM_LHS_T_RHS_NT_K_STEP();
#endif // K0 > 2

#if K0 > 3
        // Step 3
        GEMM_LHS_T_RHS_NT_K_STEP();
#endif // K0 > 3

#if K0 > 4
        // Steps 4..7
        GEMM_LHS_T_RHS_NT_K_STEP();
        GEMM_LHS_T_RHS_NT_K_STEP();
        GEMM_LHS_T_RHS_NT_K_STEP();
        GEMM_LHS_T_RHS_NT_K_STEP();
#endif // K0 > 4

#if K0 > 8
        // Steps 8..15
        GEMM_LHS_T_RHS_NT_K_STEP();
        GEMM_LHS_T_RHS_NT_K_STEP();
        GEMM_LHS_T_RHS_NT_K_STEP();
        GEMM_LHS_T_RHS_NT_K_STEP();
        GEMM_LHS_T_RHS_NT_K_STEP();
        GEMM_LHS_T_RHS_NT_K_STEP();
        GEMM_LHS_T_RHS_NT_K_STEP();
        GEMM_LHS_T_RHS_NT_K_STEP();
#endif // K0 > 8

        // Without interleaving, skip over the other (V0 - 1) / (H0 - 1) blocks stored on the same row
#ifndef LHS_INTERLEAVE
        lhs += (M0 * K0 * (V0 - 1));
#endif // LHS_INTERLEAVE

#ifndef RHS_INTERLEAVE
        rhs += (N0 * K0 * (H0 - 1));
#endif // RHS_INTERLEAVE
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
                                    2) * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // The accumulators c0..c(M0-1) are declared N0 elements wide above, so the activation is
    // applied with vector width N0 rather than relying on an externally defined VEC_SIZE.
#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // True when this work-item's block reaches or crosses the bottom / right edge of the output
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif // defined(MIXED_PRECISION)

#undef GEMM_LHS_T_RHS_NT_K_STEP
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef LHS_STEP_LOOP
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3555

#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 *                                                NOTE(review): this argument is not referenced directly in the kernel body below; the accumulation loop is bounded by the compile-time K.
 * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
                                                    __read_only image2d_t rhs_img,
#if defined(BETA)
                                                    IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                    IMAGE_DECLARATION(dst),
                                                    uint k,
                                                    uint lhs_stride_z,
                                                    uint rhs_stride_z,
#if defined(BETA)
                                                    uint bias_stride_z,
#endif //defined(BETA)
                                                    uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                    ,
                                                    uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                   )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

    // Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (M0)
#define LHS_STEP_X ((M0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (M0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // Block size
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#endif // defined(RHS_INTERLEAVE)

    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    const uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose output block starts outside the MxN output have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (z % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = z;
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates
    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

    // Each iteration consumes K0 accumulation steps. Every step loads M0 LHS values and one RHS
    // vector read from the image, then accumulates into c through ARM_MM_T_NT.
    for(int i = 0; i < K; i += K0)
    {
        VEC_DATA_TYPE(DATA_TYPE, M0) a0;
        VEC_DATA_TYPE(DATA_TYPE, N0) b0;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

#if K0 > 1
        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
#endif // K0 > 1

#if K0 > 2
        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
#endif // K0 > 2

#if K0 > 3
        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
#endif // K0 > 3

#if K0 > 4
        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
#endif // K0 > 4

#if K0 > 8
        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
#endif // K0 > 8

        // Without interleaving the K0 steps above advanced lhs by only M0 * K0 elements; add the
        // remainder so that one iteration advances M0 * K0 * V0 elements in both layouts.
#ifndef LHS_INTERLEAVE
        lhs += (M0 * K0 * (V0 - 1));
#endif // LHS_INTERLEAVE

        // Same for the RHS x coordinate: PIXEL_UNIT * K0 * H0 per iteration in both layouts.
        x_rhs += K0 * RHS_STEP_X;
#ifndef RHS_INTERLEAVE
        x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
#endif // RHS_INTERLEAVE
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // True when this work-item's block reaches or crosses the last row / column of the output
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif // defined(MIXED_PRECISION)

    // NOTE(review): RHS_STEP_LOOP is not defined by this kernel; its #undef is kept unchanged.
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
#endif // defined(OPENCL_IMAGE_SUPPORT)

3925

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3926

#endif // defined(LHS_TRANSPOSE)

3927

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

3928

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)

3929

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3930

#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

3931

3932

/** Fused multiply-add accumulate: c = fma(a, b, c).
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Addend on input; receives the result. Must be an lvalue.
 *
 * The arguments are parenthesized so that compound expressions expand safely.
 */
#define VFMA(a, b, c)             \
    ({                            \
        (c) = fma((a), (b), (c)); \
    })

/** Accumulate one k-step of the product into the M0 accumulator rows.
 *
 * For every row r in [0, M0): c##r = fma((N0-vector)(a##r.s##i), b, c##r),
 * i.e. component i of LHS row vector a##r is cast to an N0-wide vector and
 * multiplied by the RHS vector @p b.
 *
 * @param[in]      i Component selector appended to ".s" (0-9, A-F)
 * @param[in]      a Basename of the LHS row vectors (a0, a1, ...)
 * @param[in]      b RHS vector used for this step
 * @param[in, out] c Basename of the accumulator vectors (c0, c1, ...)
 *
 * The row index is pasted directly (a##0, c##0, ...), so @p a and @p c must be
 * plain identifiers. One variant is selected per supported M0 (1 to 8).
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

4008

4009

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

4010

* The LHS matrix is NOT reshaped

4011

* The RHS matrix is NOT reshaped

4012

*

4013

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4014

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

4015

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

4016

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

4017

* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)

4018

* @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

4019

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

4020

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4021

* @note Only the following configurations of M0, N0 and K0 are currently supported:

4022

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

4023

* - N0 = 2, 3, 4, 8, 16

4024

* - K0 = 2, 3, 4, 8, 16

4025

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4026

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

4027

* The activation function is performed after the bias addition

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4028

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

4029

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

4030

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4031

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4032

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4033

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

4034

*

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4035

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

4036

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

4037

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

4038

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

4039

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

4040

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

4041

* @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr

4042

* @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)

4043

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

4044

* @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)

4045

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

4046

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4047

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

4048

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4049

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

4050

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4051

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

4052

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

4053

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

4054

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4055

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4056

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4057

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4058

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

4059

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

4060

* @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)

4061

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

4062

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

4063

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

4064

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4065

*/

4066

__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),

4067

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4068

#if defined(BETA)

4069

IMAGE_DECLARATION(bias),

4070

#endif // defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4071

IMAGE_DECLARATION(dst),

4072

uint lhs_stride_z,

4073

uint rhs_stride_z,

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4074

#if defined(BETA)

4075

uint bias_stride_z,

4076

#endif //defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4077

uint dst_stride_z

4078

#if defined(REINTERPRET_INPUT_AS_3D)

4079

,

4080

uint lhs_cross_plane_pad

4081

#endif // REINTERPRET_INPUT_AS_3D

4082

#if defined(REINTERPRET_OUTPUT_AS_3D)

4083

,

4084

uint dst_cross_plane_pad

4085

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

4090

4091

// RHS offset and step X

4092

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

4093

4094

uint x = get_global_id(0);

4095

uint y = get_global_id(1);

4096

uint z = get_global_id(2);

4097

4098

#if defined(DUMMY_WORK_ITEMS)

4099

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

4104

4105

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

4106

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4107

4108

// Compute RHS matrix address

4109

uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

4110

4111

#if defined(MATRIX_B_DEPTH)

4112

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4113

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

4114

#else // defined(MATRIX_B_DEPTH)

4115

rhs_offset += z * rhs_stride_z;

4116

#endif // defined(MATRIX_B_DEPTH)

4117

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4118

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

4119

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4120

4121

#if defined(REINTERPRET_INPUT_AS_3D)

4122

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2020-10-22 16:37:12 +0100

[diff] [blame]

4123

CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4124

4125

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4126

// multiply lhs_stride_z by DEPTH_GEMM3D

4127

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

4128

4129

#else // defined(REINTERPRET_INPUT_AS_3D)

4130

4131

// Add offset for batched GEMM

4132

lhs_offset += z * lhs_stride_z;

4133

4134

#endif // defined(REINTERPRET_INPUT_AS_3D)

4135

4136

// Initialize the accumulators

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4137

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4138

4139

int i = 0;

4140

for(; i <= (K - K0); i += K0)

4141

{

4142

// Supported cases (M0, K0):

4143

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

4144

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

4145

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

4146

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

4147

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

4148

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

4149

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

4150

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

4151

// Load values from LHS matrix

4152

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

4153

4154

// Load values from RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4155

LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4156

4157

RHS_VFMA_M0xN0(0, a, b0, c);

4158

RHS_VFMA_M0xN0(1, a, b1, c);

4159

#if K0 > 2

4160

RHS_VFMA_M0xN0(2, a, b2, c);

4161

#endif // K0 > 2

4162

#if K0 > 3

4163

RHS_VFMA_M0xN0(3, a, b3, c);

4164

#endif // K0 > 3

4165

#if K0 > 4

4166

RHS_VFMA_M0xN0(4, a, b4, c);

4167

RHS_VFMA_M0xN0(5, a, b5, c);

4168

RHS_VFMA_M0xN0(6, a, b6, c);

4169

RHS_VFMA_M0xN0(7, a, b7, c);

4170

#endif // K0 > 4

4171

#if K0 > 8

4172

RHS_VFMA_M0xN0(8, a, b8, c);

4173

RHS_VFMA_M0xN0(9, a, b9, c);

Gian Marco Iodice

7b9d7ca

2019-09-19 16:37:39 +0100

[diff] [blame]

4174

RHS_VFMA_M0xN0(A, a, bA, c);

4175

RHS_VFMA_M0xN0(B, a, bB, c);

4176

RHS_VFMA_M0xN0(C, a, bC, c);

4177

RHS_VFMA_M0xN0(D, a, bD, c);

4178

RHS_VFMA_M0xN0(E, a, bE, c);

4179

RHS_VFMA_M0xN0(F, a, bF, c);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4180

#endif // K0 > 8

4181

4182

lhs_offset += K0 * sizeof(DATA_TYPE);

4183

rhs_offset += K0 * rhs_stride_y;

4184

}

4185

4186

// Left-over accumulations

4187

for(; i < K; ++i)

4188

{

4189

// Load values from LHS matrix

4190

VEC_DATA_TYPE(DATA_TYPE, 2)

4191

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));

4192

#if M0 > 1

4193

VEC_DATA_TYPE(DATA_TYPE, 2)

4194

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));

4195

#endif // M0 > 1

4196

#if M0 > 2

4197

VEC_DATA_TYPE(DATA_TYPE, 2)

4198

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));

4199

#endif // M0 > 2

4200

#if M0 > 3

4201

VEC_DATA_TYPE(DATA_TYPE, 2)

4202

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));

4203

#endif // M0 > 3

4204

#if M0 > 4

4205

VEC_DATA_TYPE(DATA_TYPE, 2)

4206

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));

4207

#endif // M0 > 4

4208

#if M0 > 5

4209

VEC_DATA_TYPE(DATA_TYPE, 2)

4210

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));

4211

#endif // M0 > 5

4212

#if M0 > 6

4213

VEC_DATA_TYPE(DATA_TYPE, 2)

4214

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));

4215

#endif // M0 > 6

4216

#if M0 > 7

4217

VEC_DATA_TYPE(DATA_TYPE, 2)

4218

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));

4219

#endif // M0 > 7

4220

4221

VEC_DATA_TYPE(DATA_TYPE, N0)

4222

b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));

4223

RHS_VFMA_M0xN0(0, a, b, c);

4224

4225

lhs_offset += sizeof(DATA_TYPE);

4226

rhs_offset += rhs_stride_y;

4227

}

4228

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

4229

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4230

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4231

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4232

4233

#if defined(REINTERPRET_OUTPUT_AS_3D)

4234

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2020-10-22 16:37:12 +0100

[diff] [blame]

4235

CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4236

4237

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4238

// multiply dst_stride_z by DEPTH_GEMM3D

4239

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

4240

4241

#else // defined(REINTERPRET_OUTPUT_AS_3D)

4242

4243

// Add offset for batched GEMM

4244

dst_addr += z * dst_stride_z;

4245

4246

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4247

4248

// Multiply by the weight of matrix-matrix product and store the result

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4249

#if defined(ALPHA)

4250

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

4251

#endif // defined(ALPHA)

4252

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4253

// Add beta*bias

4254

#if defined(BETA)

4255

#if defined(BROADCAST_BIAS)

4256

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

4257

4258

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

4259

4260

#ifndef UNIT_BETA

4261

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

4262

#endif // UNIT_BETA

4263

4264

// c = c + bias[broadcasted]

4265

ADD_BLOCK_BROADCAST(M0, c, bias0);

4266

4267

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

4268

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4269

4270

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

4271

4272

#ifndef UNIT_BETA

4273

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

4278

4279

#endif // defined(BROADCAST_BIAS)

4280

#endif // defined(BETA)

4281

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

4282

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

4283

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

4284

#endif // defined(ACTIVATION_TYPE)

4285

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

4286

const bool cond_y = y == 0;

4287

const bool cond_x = ((x + 1) * N0 >= N);

4288

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4289

// Store output block

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

4290

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4291

4292

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

4297

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4298

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4299

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta: dst = dst + beta * src

4300

*

Gian Marco

19835e5

2018-01-30 13:35:54 +0000

[diff] [blame]

4301

* @note The value of beta must be passed at compile time using -DBETA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4302

*

4303

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

4304

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

4305

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

4306

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

4307

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4308

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

4309

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4310

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

3a3066b

2017-06-23 13:38:14 +0100

[diff] [blame]

4311

* @param[in,out] dst_ptr Pointer to the destination matrix, which is read and then overwritten in place. Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4312

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4313

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4314

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4315

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4316

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

4317

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4318

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

4319

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4320

__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),

4321

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4322

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4323

// Build the Tensor3D descriptors for src and dst; their .ptr members are the addresses used by the loads/store below

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4324

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

4325

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4326

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4327

// Load 4 consecutive floats of the A x B result from the destination buffer (the operation is in place)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4328

float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

4329

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4330

// Load the 4 matching floats of matrix C from the source buffer

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4331

float4 c = vload4(0, (__global float *)src.ptr);

4332

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4333

// Compute alpha_ab + BETA * c, with the compile-time BETA cast to float4 so it applies to all 4 lanes

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4334

float4 out = alpha_ab + (float4)BETA * c;

4335

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4336

// Store the 4 results back to the destination buffer, overwriting the A x B values loaded above

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4337

vstore4(out, 0, (__global float *)dst.ptr);

4338

}

4339

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

4340

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4341

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

4342

*

Gian Marco

19835e5

2018-01-30 13:35:54 +0000

[diff] [blame]

4343

* @note The value of beta must be passed at compile time using -DBETA

Gian Marco Iodice

3a3066b

2017-06-23 13:38:14 +0100

[diff] [blame]

4344

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4345

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

4346

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

4347

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

4348

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

4349

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4350

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

4351

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4352

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

3a3066b

2017-06-23 13:38:14 +0100

[diff] [blame]

4353

* @param[in,out] dst_ptr Pointer to the destination matrix, which is read and then overwritten in place. Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4354

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4355

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4356

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4357

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4358

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

4359

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4360

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

4361

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4362

__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),

4363

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4364

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4365

// Compute source and destination addresses

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4366

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

4367

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4368

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4369

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4370

half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

4371

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4372

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4373

half8 c = vload8(0, (__global half *)src.ptr);

4374

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4375

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4376

half8 out = alpha_ab + (half8)BETA * c;

4377

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4378

// Store final result in axb matrix

Anthony Barbier