Blame - src/core/CL/cl_kernels/common/gemm_utils.cl - ml/ComputeLibrary

#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)

30

#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)

31

#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)

32

#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)

33

#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)

34

#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

35

#define CONCAT_INC(K0) INC##K0

36

#define INC(K0) CONCAT_INC(K0)

37

38

#if(SRC_WIDTH % K0)

39

#define BOUNDARY_CONDITION_X(x, a) \

40

({ \

41

a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \

42

})

43

#else // (SRC_WIDTH % K0)

44

#define BOUNDARY_CONDITION_X(x, a) \

45

({})

46

#endif // (SRC_WIDTH % K0)

47

48

#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \

49

({ \

50

if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) \

51

{ \

52

if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \

53

{ \

54

LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \

} \

else \

{ \

LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \

} \

} \

else \

{ \

if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \

64

{ \

65

LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \

} \

else \

{ \

LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \

} \

} \

})

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in

75

* the output matrix unrolling the values.

76

*

77

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

78

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

79

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

80

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

81

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

82

* @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)

83

* @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)

84

* @note Only the following values for M0, K0 and V0 are supported:

* M0: 2,3,4,5,6,7,8

* K0: 2,3,4,8,16

* V0: greater than 0

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

89

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

90

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

91

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

92

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

93

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

94

*

95

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

96

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

97

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

98

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

99

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

100

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

101

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

102

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

103

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

104

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

105

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

106

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

107

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

108

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

109

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

110

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

111

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

112

*/

113

__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),

114

TENSOR3D_DECLARATION(dst)

115

#if defined(REINTERPRET_INPUT_AS_3D)

116

,

117

uint cross_plane_pad

118

#endif // REINTERPRET_INPUT_AS_3D

)

{

// Block size

#define BLOCK_SIZE ((M0) * (K0))

123

124

// Output offset X

125

#if defined(INTERLEAVE)

126

#define OUTPUT_OFFSET_X (K0)

127

#else // defined(INTERLEAVE)

128

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

129

#endif // defined(INTERLEAVE)

130

131

// Output step X

132

#if defined(INTERLEAVE)

133

#define OUTPUT_STEP_X (K0) * (V0)

134

#else // Do not interleave

135

#define OUTPUT_STEP_X (K0)

136

#endif // defined(INTERLEAVE)

137

138

// Compute source and destination addresses

139

uint x = get_global_id(0);

140

uint y = get_global_id(1);

141

uint z = get_global_id(2);

142

143

// ------------------ Compute input/output addresses ---------------------------

144

145

// Compute the input address

146

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

147

148

// Compute the output address

149

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *

150

(uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

151

152

// Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;

153

REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

154

155

#if defined(REINTERPRET_INPUT_AS_3D)

156

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

157

// multiply src_stride_z by DEPTH_GEMM3D

158

159

input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

160

161

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

162

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

163

164

#else // defined(REINTERPRET_INPUT_AS_3D)

165

166

input_ptr += z * (uint)src_stride_z;

167

168

#endif // defined(REINTERPRET_INPUT_AS_3D)

169

170

// Add offset for batched GEMM

171

output_ptr += z * (uint)dst_stride_z;

172

173

// ---------------------------Load input values --------------------------------

174

// Load values from the LHS matrix

175

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);

176

177

LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);

178

179

// ---------------------------Store output values ------------------------------

180

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

181

STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

182

183

#undef BLOCK_SIZE

184

#undef OUTPUT_OFFSET_X

#undef OUTPUT_STEP_X

}

#if M0 == 2

#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \

190

({ \

191

VEC_DATA_TYPE(DATA_TYPE, M0) \

192

res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \

193

VSTORE(M0) \

194

(res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \

195

})

196

#elif M0 == 3 // M0 == 3

197

#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \

198

({ \

199

VEC_DATA_TYPE(DATA_TYPE, M0) \

200

res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \

201

VSTORE(M0) \

202

(res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \

203

})

204

#elif M0 == 4 // M0 == 4

205

#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \

206

({ \

207

VEC_DATA_TYPE(DATA_TYPE, M0) \

208

res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \

209

VSTORE(M0) \

210

(res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \

211

})

212

#elif M0 == 5 // M0 == 5

213

#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \

214

({ \

215

VEC_DATA_TYPE(DATA_TYPE, 4) \

216

res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \

217

DATA_TYPE res1 = a4.s##i; \

218

VSTORE(4) \

219

(res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \

220

*((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \

221

})

222

#elif M0 == 6 // M0 == 6

223

#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \

224

({ \

225

VEC_DATA_TYPE(DATA_TYPE, 4) \

226

res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \

227

VEC_DATA_TYPE(DATA_TYPE, 2) \

228

res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \

229

VSTORE(4) \

230

(res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \

231

VSTORE(2) \

232

(res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \

233

})

234

#elif M0 == 7 // M0 == 7

235

#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \

236

({ \

237

VEC_DATA_TYPE(DATA_TYPE, 4) \

238

res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \

239

VEC_DATA_TYPE(DATA_TYPE, 3) \

240

res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \

241

VSTORE(4) \

242

(res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \

243

VSTORE(3) \

244

(res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \

245

})

246

#elif M0 == 8 // M0 == 8

247

#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \

248

({ \

249

VEC_DATA_TYPE(DATA_TYPE, M0) \

250

res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \

251

VSTORE(M0) \

252

(res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \

253

})

254

#else // M0 not supported

255

#error "M0 value not supported"

256

#endif // N0 conditions

257

258

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in

259

* the output matrix unrolling the values.

260

*

261

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

262

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

263

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

264

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

265

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

266

* @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)

267

* @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)

268

* @note Only the following values for M0, K0 and V0 are supported:

* M0: 2,3,4,5,6,7,8

* K0: 2,3,4,8,16

* V0: greater than 0

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

273

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

274

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

275

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

276

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

277

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

278

*

279

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

280

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

281

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

282

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

283

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

284

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

285

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

286

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

287

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

288

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

289

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

290

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

291

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

292

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

293

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

294

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

295

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

296

*/

297

__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),

298

TENSOR3D_DECLARATION(dst)

299

#if defined(REINTERPRET_INPUT_AS_3D)

300

,

301

uint cross_plane_pad

302

#endif // REINTERPRET_INPUT_AS_3D

)

{

// Block size

#define BLOCK_SIZE ((M0) * (K0))

307

308

// Output offset X

309

#if defined(INTERLEAVE)

310

#define OUTPUT_OFFSET_X (M0)

311

#else // defined(INTERLEAVE)

312

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

313

#endif // defined(INTERLEAVE)

314

315

// Output step X

316

#if defined(INTERLEAVE)

317

#define OUTPUT_STEP_X (M0) * (V0)

318

#else // Do not interleave

319

#define OUTPUT_STEP_X (M0)

320

#endif // defined(INTERLEAVE)

321

322

// Compute source and destination addresses

323

uint x = get_global_id(0);

324

uint y = get_global_id(1);

325

uint z = get_global_id(2);

326

327

// ------------------ Compute input/output addresses ---------------------------

328

329

// Compute the input address

330

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

331

332

// Compute the output address

333

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *

334

(uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

335

336

// Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;

337

REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

338

339

#if defined(REINTERPRET_INPUT_AS_3D)

340

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

341

// multiply src_stride_z by DEPTH_GEMM3D

342

343

input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

344

345

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

346

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

347

348

#else // defined(REINTERPRET_INPUT_AS_3D)

349

350

input_ptr += z * (uint)src_stride_z;

351

352

#endif // defined(REINTERPRET_INPUT_AS_3D)

353

354

// Add offset for batched GEMM

355

output_ptr += z * (uint)dst_stride_z;

356

357

// ---------------------------Load input values --------------------------------

358

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);

359

360

LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);

361

362

// ---------------------------Transpose and store block -----------------------

363

364

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);

365

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);

366

#if K0 > 2

367

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);

368

#endif // K0 > 2

369

#if K0 > 3

370

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);

371

#endif // K0 > 3

372

#if K0 > 4

373

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);

374

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);

375

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);

376

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);

377

#endif // K0 > 4

378

#if K0 > 8

379

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);

380

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);

381

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);

382

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);

383

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);

384

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);

385

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);

386

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);

#endif // K0 > 8

#undef BLOCK_SIZE

#undef OUTPUT_OFFSET_X

391

#undef OUTPUT_STEP_X

392

}

393

#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)

394

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

395

#if defined(RESHAPE_RHS_NT)

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

396

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in

397

* the output matrix unrolling the values.

398

*

399

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

400

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

401

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

402

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

403

* @note Only the following values for K0, N0 and H0 are supported:

* N0: 2,3,4,8,16

* K0: 1,2,3,4,8,16

* H0: greater than 0

*

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

408

* @param[in] src_ptr Pointer to the source tensor. Supported data types: All

409

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

410

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

411

* @param[in] src_w The size of the width dimension of the source tensor

412

* @param[in] src_h The size of the height dimension of the source tensor

413

* @param[in] src_n The size of the depth dimension of the source tensor

414

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

415

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All

416

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

417

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

418

* @param[in] dst_w The size of the width dimension of the destination tensor

419

* @param[in] dst_h The size of the height dimension of the destination tensor

420

* @param[in] dst_n The size of the depth dimension of the destination tensor

421

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

422

* @param[in] H0 The number of blocks to place on the same row. It must be greater than 0.

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

423

*/

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

424

__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER),

425

TENSOR3D_T(dst, BUFFER),

426

const int H0)

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

427

{

428

// Block size

429

#define BLOCK_SIZE ((K0) * (N0))

430

431

// Output offset X

432

#if defined(INTERLEAVE)

433

#define OUTPUT_OFFSET_X (N0)

434

#else // defined(INTERLEAVE)

435

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

436

#endif // defined(INTERLEAVE)

437

438

// Output step X

439

#if defined(INTERLEAVE)

440

#define OUTPUT_STEP_X (N0) * (H0)

441

#else // Do not interleave

442

#define OUTPUT_STEP_X (N0)

443

#endif // defined(INTERLEAVE)

444

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

445

const int x = GET_SPATIAL_IDX(0, 1, 0);

446

const int y = GET_SPATIAL_IDX(1, 1, 0);

447

const int z = GET_SPATIAL_IDX(2, 1, 0);

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

448

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

449

const int xi = x * N0;

450

const int yi = y * K0;

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

451

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

452

const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;

453

const int yo = (x / H0);

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

454

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

455

src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;

456

dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

457

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

458

TILE(DATA_TYPE, K0, N0, in);

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

459

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

460

// Initialize the tile to zero

461

for(int i = 0; i < K0; ++i)

462

{

463

in[i].v = 0;

464

}

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

465

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

466

// Load input tile

467

for(int i = 0; i < K0; ++i)

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

468

{

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

469

if(yi + i < src_h)

470

{

471

in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);

472

}

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

473

}

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

474

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

475

TILE(uint, K0, 1, dst_indirect_y);

476

for(int i = 0; i < K0; ++i)

477

{

478

dst_indirect_y[i].v = i;

479

}

480

481

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

482

483

#undef BLOCK_SIZE

484

#undef OUTPUT_OFFSET_X

485

#undef OUTPUT_STEP_X

486

}

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

487

#endif // defined(RESHAPE_RHS_NT)

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

488

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

489

#if defined(RESHAPE_RHS_T)

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

490

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in

491

* the output matrix unrolling the values.

492

*

493

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

494

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

495

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

496

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

497

* @note The option -DTRANSPOSE must passed at compile time.

498

* @note Only the following values for K0, N0 and H0 are supported:

* N0: 2,3,4,8,16

* K0: 2,3,4,8,16

* H0: greater than 0

*

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

503

* @param[in] src_ptr Pointer to the source tensor. Supported data types: All

504

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

505

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

506

* @param[in] src_w The size of the width dimension of the source tensor

507

* @param[in] src_h The size of the height dimension of the source tensor

508

* @param[in] src_n The size of the depth dimension of the source tensor

509

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

510

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: All

511

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

512

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

513

* @param[in] dst_w The size of the width dimension of the destination tensor

514

* @param[in] dst_h The size of the height dimension of the destination tensor

515

* @param[in] dst_n The size of the depth dimension of the destination tensor

516

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

517

* @param[in] H0 The number of blocks to place on the same row. It must be greater than 0.

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

518

*/

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

519

__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER),

520

TENSOR3D_T(dst, BUFFER),

521

const int H0)

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

522

{

523

// Block size

524

#define BLOCK_SIZE ((K0) * (N0))

525

526

// Output offset X

527

#if defined(INTERLEAVE)

528

#define OUTPUT_OFFSET_X (K0)

529

#else // defined(INTERLEAVE)

530

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

531

#endif // defined(INTERLEAVE)

532

533

// Output step X

534

#if defined(INTERLEAVE)

535

#define OUTPUT_STEP_X (K0) * (H0)

536

#else // Do not interleave

537

#define OUTPUT_STEP_X (K0)

538

#endif // defined(INTERLEAVE)

539

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

540

const int x = GET_SPATIAL_IDX(0, 1, 0);

541

const int y = GET_SPATIAL_IDX(1, 1, 0);

542

const int z = GET_SPATIAL_IDX(2, 1, 0);

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

543

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

544

const int xi = x * N0;

545

const int yi = y * K0;

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

546

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

547

const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;

548

const int yo = (x / H0);

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

549

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

550

src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;

551

dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

552

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

553

TILE(DATA_TYPE, K0, N0, in);

554

TILE(DATA_TYPE, N0, K0, in_tr);

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

555

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

556

// Initialize the tile to zero

557

for(int i = 0; i < K0; ++i)

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

558

{

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

559

in[i].v = 0;

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

560

}

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

561

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

562

// Load input tile

563

for(int i = 0; i < K0; ++i)

{

if(yi + i < src_h)

{

in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);

568

}

569

}

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

570

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

571

// Transpose input tile

572

for(int k0 = 0; k0 < K0; ++k0)

573

{

574

for(int n0 = 0; n0 < N0; ++n0)

575

{

576

in_tr[n0].s[k0] = in[k0].s[n0];

577

}

578

}

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

579

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

580

TILE(uint, N0, 1, dst_indirect_y);

581

for(int i = 0; i < N0; ++i)

582

{

583

dst_indirect_y[i].v = i;

584

}

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

585

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

586

T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);

ramelg01

9cca592

2021-11-11 10:05:00 +0000

[diff] [blame]

587

588

#undef BLOCK_SIZE

589

#undef OUTPUT_OFFSET_X

590

#undef OUTPUT_STEP_X

591

}

Gian Marco Iodice

4fb5670

2021-11-10 11:18:50 +0000

[diff] [blame^]

592

593

#endif // defined(RESHAPE_RHS_T)