Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2019-07-19 09:54:47 +0100

[diff] [blame]

49

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

50

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

51

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

52

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

53

* @note Only the following values for M0, K0 and V0 are supported:

54

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

55

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

56

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

57

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

58

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

59

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

60

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

61

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

62

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

63

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame^]

64

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

65

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

66

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

67

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

68

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

69

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

70

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

71

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

72

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

73

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

74

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

75

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

76

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

77

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

78

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

79

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

80

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

81

*/

82

__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                         ,
                                         uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                        )
{
    // Number of elements in one M0xK0 block
#define BLOCK_SIZE ((M0) * (K0))

    // OUTPUT_OFFSET_X: distance (in elements) between the output positions of two work-items that share
    //                  the same output row (selected by y % V0).
    // OUTPUT_STEP_X  : distance (in elements) between two consecutive rows of the same block in the output.
    // Both macros are fully parenthesised so that they can be safely embedded in larger expressions.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#define OUTPUT_STEP_X ((K0) * (V0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates: x selects the K0-wide block column, y the M0-tall block row, z the batch
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address (top-left element of the M0xK0 block)
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes
                                + x * (uint)K0 * sizeof(DATA_TYPE)
                                + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 consecutive block rows (y) are packed on the same output row
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes
                                 + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE))
                                 + ((y / (uint)V0) * (uint)dst_stride_y)
                                 + ((y % (uint)V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Load the M0 rows (a0 ... a(M0-1)) of K0 elements each from the LHS matrix
    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

    // Apply the X boundary handling to every loaded row.
    // NOTE(review): BOUNDARY_CONDITION_X is defined outside this excerpt; presumably it neutralises the
    // elements that fall beyond SRC_WIDTH - confirm against its definition.
    BOUNDARY_CONDITION_X(x, a0);
#if M0 > 1
    BOUNDARY_CONDITION_X(x, a1);
#endif // M0 > 1
#if M0 > 2
    BOUNDARY_CONDITION_X(x, a2);
#endif // M0 > 2
#if M0 > 3
    BOUNDARY_CONDITION_X(x, a3);
#endif // M0 > 3
#if M0 > 4
    BOUNDARY_CONDITION_X(x, a4);
#endif // M0 > 4
#if M0 > 5
    BOUNDARY_CONDITION_X(x, a5);
#endif // M0 > 5
#if M0 > 6
    BOUNDARY_CONDITION_X(x, a6);
#endif // M0 > 6
#if M0 > 7
    BOUNDARY_CONDITION_X(x, a7);
#endif // M0 > 7

    // ---------------------------Store output values ------------------------------

    // Create variables: uint zout0=0, zout1=0, ... zoutF=0 (no extra Z offset on the output)
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

    // Store the block (not transposed); consecutive rows are OUTPUT_STEP_X elements apart
    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

175

176

/** Gather column @p i of the loaded block and store it contiguously in the output.
 *
 * The macro reads component @p i of each row vector a0 ... a(M0-1) (which must be in scope at the point of use),
 * packs the M0 values together and writes them at @p output_ptr + i * @p output_step_x elements.
 * For M0 = 5, 6 and 7 the column is written as a 4-element vector followed by the remaining 1, 2 or 3 elements.
 *
 * @param[in] output_ptr    Byte pointer to the beginning of the destination block
 * @param[in] output_step_x Distance (in elements) between two consecutive stored columns
 * @param[in] i             Column index as a single hexadecimal digit (0-9, A-F). It is pasted both into the
 *                          vector component selector (.s##i) and into the hexadecimal literal (0x##i)
 */
#if M0 == 2
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                      \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i);                                           \
        VSTORE(M0)                                                                                        \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
    })
#elif M0 == 3 // M0 == 3
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                      \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i);                                  \
        VSTORE(M0)                                                                                        \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
    })
#elif M0 == 4 // M0 == 4
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                      \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                         \
        VSTORE(M0)                                                                                        \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
    })
#elif M0 == 5 // M0 == 5
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                       \
        res0           = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);               \
        DATA_TYPE res1 = a4.s##i;                                                                         \
        VSTORE(4)                                                                                         \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));    \
        *((__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4) = res1; \
    })
#elif M0 == 6 // M0 == 6
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                            \
    ({                                                                                                      \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                         \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                           \
        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                         \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i);                                             \
        VSTORE(4)                                                                                           \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));      \
        VSTORE(2)                                                                                           \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4);  \
    })
#elif M0 == 7 // M0 == 7
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                            \
    ({                                                                                                      \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                         \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                           \
        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                         \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i);                                    \
        VSTORE(4)                                                                                           \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));      \
        VSTORE(3)                                                                                           \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4);  \
    })
#elif M0 == 8 // M0 == 8
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                                      \
    ({                                                                                                                \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                                  \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
        VSTORE(M0)                                                                                                    \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));                 \
    })
#else // M0 not supported
#error "M0 value not supported"
#endif // M0 conditions

245

246

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in

247

* the output matrix unrolling the values.

248

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

249

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

250

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

251

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

252

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

253

* @note Only the following values for M0, K0 and V0 are supported:

254

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

255

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

256

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

257

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

258

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

259

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

260

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

261

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

262

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

263

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame^]

264

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

265

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

266

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

267

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

268

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

269

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

270

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

271

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

272

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

273

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

274

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

275

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

276

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

277

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

278

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

279

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

280

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

281

*/

282

__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),

283

TENSOR3D_DECLARATION(dst)

284

#if defined(REINTERPRET_INPUT_AS_3D)

285

,

286

uint cross_plane_pad

287

#endif // REINTERPRET_INPUT_AS_3D

)

{

// Block size

#define BLOCK_SIZE ((M0) * (K0))

292

293

// Output offset X

294

#if defined(INTERLEAVE)

295

#define OUTPUT_OFFSET_X (M0)

296

#else // defined(INTERLEAVE)

297

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

298

#endif // defined(INTERLEAVE)

299

300

// Output step X

301

#if defined(INTERLEAVE)

302

#define OUTPUT_STEP_X (M0) * (V0)

303

#else // Do not interleave

304

#define OUTPUT_STEP_X (M0)

305

#endif // defined(INTERLEAVE)

306

307

// Compute source and destination addresses

308

uint x = get_global_id(0);

309

uint y = get_global_id(1);

310

uint z = get_global_id(2);

311

312

// ------------------ Compute input/output addresses ---------------------------

313

314

// Compute the input address

315

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

316

317

// Compute the output address

318

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *

319

(uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

320

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

321

// Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;

322

REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

323

324

#if defined(REINTERPRET_INPUT_AS_3D)

325

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

326

// multiply src_stride_z by DEPTH_GEMM3D

327

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

328

input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

329

330

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

331

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

332

333

#else // defined(REINTERPRET_INPUT_AS_3D)

334

335

input_ptr += z * (uint)src_stride_z;

336

337

#endif // defined(REINTERPRET_INPUT_AS_3D)

338

339

// Add offset for batched GEMM

340

output_ptr += z * (uint)dst_stride_z;

341

342

// ---------------------------Load input values --------------------------------

343

344

// Load values from the LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

345

LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

346

BOUNDARY_CONDITION_X(x, a0);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

347

#if M0 > 1

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

348

BOUNDARY_CONDITION_X(x, a1);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

349

#endif // M0 > 1

350

#if M0 > 2

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

351

BOUNDARY_CONDITION_X(x, a2);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

352

#endif // M0 > 2

353

#if M0 > 3

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

354

BOUNDARY_CONDITION_X(x, a3);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

355

#endif // M0 > 3

356

#if M0 > 4

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

357

BOUNDARY_CONDITION_X(x, a4);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

358

#endif // M0 > 4

359

#if M0 > 5

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

360

BOUNDARY_CONDITION_X(x, a5);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

361

#endif // M0 > 5

362

#if M0 > 6

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

363

BOUNDARY_CONDITION_X(x, a6);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

364

#endif // M0 > 6

365

#if M0 > 7

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

366

BOUNDARY_CONDITION_X(x, a7);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

367

#endif // M0 > 7

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

368

// ---------------------------Transpose and store block -----------------------

369

370

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);

371

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);

372

#if K0 > 2

373

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

374

#endif // K0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

375

#if K0 > 3

376

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);

377

#endif // K0 > 3

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

378

#if K0 > 4

379

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);

380

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);

381

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);

382

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);

383

#endif // K0 > 4

384

#if K0 > 8

385

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);

386

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);

387

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);

388

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);

389

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);

390

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);

391

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);

392

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);

#endif // K0 > 8

#undef BLOCK_SIZE

#undef OUTPUT_OFFSET_X

397

#undef OUTPUT_STEP_X

398

}

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

399

#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

400

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

401

#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

402

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in

403

* the output matrix unrolling the values.

404

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

405

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

406

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

407

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

408

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

409

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

410

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

411

* N0: 2,3,4,8,16

412

* K0: 1,2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

413

* H0: greater than 0

414

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame^]

415

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

416

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

417

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

418

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

419

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

420

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

421

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

422

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

423

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

424

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

425

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

426

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

427

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

428

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

429

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

430

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

431

*/

432

__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),

433

TENSOR3D_DECLARATION(dst))

434

{

435

// Block size

436

#define BLOCK_SIZE ((K0) * (N0))

437

438

// Output offset X

439

#if defined(INTERLEAVE)

440

#define OUTPUT_OFFSET_X (N0)

441

#else // defined(INTERLEAVE)

442

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

443

#endif // defined(INTERLEAVE)

444

445

// Output step X

446

#if defined(INTERLEAVE)

447

#define OUTPUT_STEP_X (N0) * (H0)

448

#else // Do not interleave

449

#define OUTPUT_STEP_X (N0)

450

#endif // defined(INTERLEAVE)

451

452

// Compute source and destination addresses

453

uint x = get_global_id(0);

454

uint y = get_global_id(1);

455

uint z = get_global_id(2);

456

457

// ------------------ Compute input/output addresses ---------------------------

458

459

// Compute the input address

460

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

461

462

// Compute the output address

463

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((

464

x / (uint)H0)

465

* (uint)dst_stride_y)

466

+ z * (uint)dst_stride_z;

467

468

// ---------------------------Load input values --------------------------------

469

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

470

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

471

472

// Load values from the RHS matrix

473

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

474

#if K0 > 1

475

if(y * (uint)K0 + 1 < SRC_HEIGHT)

476

{

477

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

}

#endif // K0 > 1

#if K0 > 2

if(y * (uint)K0 + 2 < SRC_HEIGHT)

482

{

483

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

484

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

485

#endif // K0 > 2

486

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

487

if(y * (uint)K0 + 3 < SRC_HEIGHT)

488

{

489

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

490

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

491

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

492

#if K0 > 4

493

if(y * (uint)K0 + 4 < SRC_HEIGHT)

494

{

495

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

496

}

497

if(y * (uint)K0 + 5 < SRC_HEIGHT)

498

{

499

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

500

}

501

if(y * (uint)K0 + 6 < SRC_HEIGHT)

502

{

503

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

504

}

505

if(y * (uint)K0 + 7 < SRC_HEIGHT)

506

{

507

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

508

}

509

#endif // K0 > 4

510

#if K0 > 8

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

511

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

512

{

513

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

514

}

515

if(y * (uint)K0 + 9 < SRC_HEIGHT)

516

{

517

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

518

}

519

if(y * (uint)K0 + 10 < SRC_HEIGHT)

520

{

521

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

522

}

523

if(y * (uint)K0 + 11 < SRC_HEIGHT)

524

{

525

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

526

}

527

if(y * (uint)K0 + 12 < SRC_HEIGHT)

528

{

529

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

530

}

531

if(y * (uint)K0 + 13 < SRC_HEIGHT)

532

{

533

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

534

}

535

if(y * (uint)K0 + 14 < SRC_HEIGHT)

536

{

537

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

538

}

539

if(y * (uint)K0 + 15 < SRC_HEIGHT)

540

{

541

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Store output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

546

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

547

STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

548

549

#undef BLOCK_SIZE

550

#undef OUTPUT_OFFSET_X

#undef OUTPUT_STEP_X

}

#if defined(TRANSPOSE)

555

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in

556

* the output matrix unrolling the values.

557

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

558

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

559

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

560

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

561

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

562

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

563

* @note The option -DTRANSPOSE must be passed at compile time.

564

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

565

* N0: 2,3,4,8,16

566

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

567

* H0: greater than 0

568

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame^]

569

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

570

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

571

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

572

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

573

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

574

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

575

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

576

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

577

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

578

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

579

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

580

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

581

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

582

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

583

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

584

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

585

*/

586

__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),

587

TENSOR3D_DECLARATION(dst))

588

{

589

// Block size

590

#define BLOCK_SIZE ((K0) * (N0))

591

592

// Output offset X

593

#if defined(INTERLEAVE)

594

#define OUTPUT_OFFSET_X (K0)

595

#else // defined(INTERLEAVE)

596

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

597

#endif // defined(INTERLEAVE)

598

599

// Output step X

600

#if defined(INTERLEAVE)

601

#define OUTPUT_STEP_X (K0) * (H0)

602

#else // Do not interleave

603

#define OUTPUT_STEP_X (K0)

604

#endif // defined(INTERLEAVE)

605

606

// Compute source and destination addresses

607

uint x = get_global_id(0);

608

uint y = get_global_id(1);

609

uint z = get_global_id(2);

610

611

// ------------------ Compute input/output addresses ---------------------------

612

613

// Compute the input address

614

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

615

616

// Compute the output address

617

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /

618

(uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

619

620

// ---------------------------Load input values --------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

621

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

622

623

// Load values from the RHS matrix

624

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

625

if(y * (uint)K0 + 1 < SRC_HEIGHT)

626

{

627

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

628

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

629

#if K0 > 2

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

630

if(y * (uint)K0 + 2 < SRC_HEIGHT)

631

{

632

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

633

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

634

#endif // K0 > 2

635

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

636

if(y * (uint)K0 + 3 < SRC_HEIGHT)

637

{

638

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

639

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

640

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

641

#if K0 > 4

642

if(y * (uint)K0 + 4 < SRC_HEIGHT)

643

{

644

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

645

}

646

if(y * (uint)K0 + 5 < SRC_HEIGHT)

647

{

648

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

649

}

650

if(y * (uint)K0 + 6 < SRC_HEIGHT)

651

{

652

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

653

}

654

if(y * (uint)K0 + 7 < SRC_HEIGHT)

655

{

656

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

657

}

658

#endif // K0 > 4

659

#if K0 > 8

Gian Marco Iodice

8912434

2018-12-19 14:17:22 +0000

[diff] [blame]

660

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

661

{

662

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

663

}

664

if(y * (uint)K0 + 9 < SRC_HEIGHT)

665

{

666

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

667

}

668

if(y * (uint)K0 + 10 < SRC_HEIGHT)

669

{

670

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

671

}

672

if(y * (uint)K0 + 11 < SRC_HEIGHT)

673

{

674

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

675

}

676

if(y * (uint)K0 + 12 < SRC_HEIGHT)

677

{

678

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

679

}

680

if(y * (uint)K0 + 13 < SRC_HEIGHT)

681

{

682

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

683

}

684

if(y * (uint)K0 + 14 < SRC_HEIGHT)

685

{

686

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

687

}

688

if(y * (uint)K0 + 15 < SRC_HEIGHT)

689

{

690

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Transpose the block ------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

695

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

696

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

697

#if K0 == 2

698

// This part computes the following transpositions:

// 2x2 -> 2x2

// 2x4 -> 4x2

// 2x8 -> 8x2

// 2x16 -> 16x2

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);

704

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);

705

#if N0 > 2

706

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);

707

#endif // N0 > 2

708

#if N0 > 3

709

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);

710

#endif // N0 > 3

711

#if N0 > 4

712

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);

713

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);

714

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);

715

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);

716

#endif // N0 > 4

717

#if N0 > 8

718

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);

719

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);

720

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);

721

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);

722

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);

723

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);

724

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);

725

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);

726

#endif // N0 > 8

727

728

#elif K0 == 3 // K0 == 2

729

// This part computes the following transpositions:

// 3x2 -> 2x3

// 3x4 -> 4x3

// 3x8 -> 8x3

// 3x16 -> 16x3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

734

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);

735

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

736

#if N0 > 2

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

737

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

738

#endif // N0 > 2

739

#if N0 > 3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

740

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

741

#endif // N0 > 3

742

#if N0 > 4

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

743

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);

744

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);

745

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);

746

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

747

#endif // N0 > 4

748

#if N0 > 8

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

749

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);

750

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);

751

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);

752

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);

753

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);

754

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);

755

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);

756

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

757

#endif // N0 > 8

758

759

#elif K0 == 4 // K0 == 4

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

760

// This part computes the following transpositions:

// 4x2 -> 2x4

// 4x4 -> 4x4

// 4x8 -> 8x4

// 4x16 -> 16x4

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);

766

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);

767

#if N0 > 2

768

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

769

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

770

#if N0 > 3

771

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);

772

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

773

#if N0 > 4

774

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);

775

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);

776

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);

777

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);

778

#endif // N0 > 4

779

#if N0 > 8

780

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);

781

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);

782

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);

783

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);

784

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);

785

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);

786

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);

787

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);

788

#endif // N0 > 8

789

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

790

#elif K0 == 8 // K0 == 8

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

791

// This part computes the following transpositions:

// 8x2 -> 2x8

// 8x4 -> 4x8

// 8x8 -> 8x8

// 8x16 -> 16x8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

796

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);

797

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

798

#if N0 > 2

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

799

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

800

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

801

#if N0 > 3

802

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);

803

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

804

#if N0 > 4

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

805

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);

806

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);

807

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);

808

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

809

#endif // N0 > 4

810

#if N0 > 8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

811

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);

812

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);

813

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);

814

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);

815

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);

816

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);

817

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);

818

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

819

#endif // N0 > 8

820

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

821

#elif K0 == 16 // K0 == 16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

822

823

// This part computes the following transpositions:

// 16x2 -> 2x16

// 16x4 -> 4x16

// 16x8 -> 8x16

// 16x16 -> 16x16

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,

829

a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);

830

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,

831

a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);

832

#if N0 > 2

833

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,

834

a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

835

#endif // N0 > 2

836

#if N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

837

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,

838

a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

839

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

840

#if N0 > 4

841

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,

842

a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);

843

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,

844

a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);

845

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,

846

a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);

847

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,

848

a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);

849

#endif // N0 > 4

850

#if N0 > 8

851

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,

852

a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);

853

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,

854

a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);

855

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,

856

a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);

857

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,

858

a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);

859

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,

860

a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);

861

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,

862

a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);

863

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,

864

a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);

865

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,

866

a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);

#endif // N0 > 8

#else // N0 == 16

#error "Not supported N0 value"

871

#endif // N0 > 2

872

873

// ---------------------------Store the output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

874

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

875

STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

876

877

#undef BLOCK_SIZE

878

#undef OUTPUT_OFFSET_X

879

#undef OUTPUT_STEP_X

880

}

881

#endif // defined(TRANSPOSE)

882

#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

883

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

884

#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

885

886

#define CONCAT(a, b) a##b

887

888

#define ARM_DOT1(a, b, c) \

({ \

c = fma(a, b, c); \

})

#define ARM_DOT2(a, b, c) \

893

({ \

894

c = fma(a.s0, b.s0, c); \

895

c = fma(a.s1, b.s1, c); \

896

})

897

#define ARM_DOT3(a, b, c) \

898

({ \

899

ARM_DOT2(a, b, c); \

900

c = fma((a.s2), (b.s2), c); \

901

})

902

#define ARM_DOT4(a, b, c) \

903

({ \

904

ARM_DOT3(a, b, c); \

905

c = fma((a.s3), (b.s3), c); \

906

})

907

#define ARM_DOT8(a, b, c) \

908

({ \

909

ARM_DOT4((a.lo), (b.lo), c); \

910

ARM_DOT4((a.hi), (b.hi), c); \

911

})

912

#define ARM_DOT16(a, b, c) \

913

({ \

914

ARM_DOT8((a.lo), (b.lo), c); \

915

ARM_DOT8((a.hi), (b.hi), c); \

})

#if N0 == 2

#define ARM_DOT_K0XN0(k0, a, b, c) \

920

({ \

921

CONCAT(ARM_DOT, k0) \

922

((a), (b##0), (c.s0)); \

923

CONCAT(ARM_DOT, k0) \

924

((a), (b##1), (c.s1)); \

925

})

926

#elif N0 == 3 // N0 == 3

927

#define ARM_DOT_K0XN0(k0, a, b, c) \

928

({ \

929

CONCAT(ARM_DOT, k0) \

930

((a), (b##0), (c.s0)); \

931

CONCAT(ARM_DOT, k0) \

932

((a), (b##1), (c.s1)); \

933

CONCAT(ARM_DOT, k0) \

934

((a), (b##2), (c.s2)); \

935

})

936

#elif N0 == 4 // N0 == 4

937

#define ARM_DOT_K0XN0(k0, a, b, c) \

938

({ \

939

CONCAT(ARM_DOT, k0) \

940

((a), (b##0), (c.s0)); \

941

CONCAT(ARM_DOT, k0) \

942

((a), (b##1), (c.s1)); \

943

CONCAT(ARM_DOT, k0) \

944

((a), (b##2), (c.s2)); \

945

CONCAT(ARM_DOT, k0) \

946

((a), (b##3), (c.s3)); \

947

})

948

#elif N0 == 8 // N0 == 8

949

#define ARM_DOT_K0XN0(k0, a, b, c) \

950

({ \

951

CONCAT(ARM_DOT, k0) \

952

((a), (b##0), (c.s0)); \

953

CONCAT(ARM_DOT, k0) \

954

((a), (b##1), (c.s1)); \

955

CONCAT(ARM_DOT, k0) \

956

((a), (b##2), (c.s2)); \

957

CONCAT(ARM_DOT, k0) \

958

((a), (b##3), (c.s3)); \

959

CONCAT(ARM_DOT, k0) \

960

((a), (b##4), (c.s4)); \

961

CONCAT(ARM_DOT, k0) \

962

((a), (b##5), (c.s5)); \

963

CONCAT(ARM_DOT, k0) \

964

((a), (b##6), (c.s6)); \

965

CONCAT(ARM_DOT, k0) \

966

((a), (b##7), (c.s7)); \

967

})

968

#elif N0 == 16 // N0 == 16

969

#define ARM_DOT_K0XN0(k0, a, b, c) \

970

({ \

971

CONCAT(ARM_DOT, k0) \

972

((a), (b##0), (c.s0)); \

973

CONCAT(ARM_DOT, k0) \

974

((a), (b##1), (c.s1)); \

975

CONCAT(ARM_DOT, k0) \

976

((a), (b##2), (c.s2)); \

977

CONCAT(ARM_DOT, k0) \

978

((a), (b##3), (c.s3)); \

979

CONCAT(ARM_DOT, k0) \

980

((a), (b##4), (c.s4)); \

981

CONCAT(ARM_DOT, k0) \

982

((a), (b##5), (c.s5)); \

983

CONCAT(ARM_DOT, k0) \

984

((a), (b##6), (c.s6)); \

985

CONCAT(ARM_DOT, k0) \

986

((a), (b##7), (c.s7)); \

987

CONCAT(ARM_DOT, k0) \

988

((a), (b##8), (c.s8)); \

989

CONCAT(ARM_DOT, k0) \

990

((a), (b##9), (c.s9)); \

991

CONCAT(ARM_DOT, k0) \

992

((a), (b##A), (c.sA)); \

993

CONCAT(ARM_DOT, k0) \

994

((a), (b##B), (c.sB)); \

995

CONCAT(ARM_DOT, k0) \

996

((a), (b##C), (c.sC)); \

997

CONCAT(ARM_DOT, k0) \

998

((a), (b##D), (c.sD)); \

999

CONCAT(ARM_DOT, k0) \

1000

((a), (b##E), (c.sE)); \

1001

CONCAT(ARM_DOT, k0) \

1002

((a), (b##F), (c.sF)); \

1003

})

1004

#else // N0 not supported

1005

#error "N0 value not supported"

1006

#endif // N0 conditions

1007

1008

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1009

* The LHS matrix is NOT reshaped

1010

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1011

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1012

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1013

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1014

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

1015

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1016

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1017

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1018

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1019

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1020

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1021

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1022

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1023

* - N0 = 2, 3, 4, 8, 16

1024

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1025

* - H0 >= 1

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1026

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1027

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1028

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1029

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1030

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1031

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1032

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1033

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1034

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1035

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1036

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1037

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1038

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1039

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1040

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1041

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1042

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1043

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1044

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1045

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1046

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1047

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1048

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1049

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1050

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1051

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1052

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1053

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1054

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

1055

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1056

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1057

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1058

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1059

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1060

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1061

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1062

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1063

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1064

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1065

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1066

*/

1067

__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),

1068

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1069

#if defined(BETA)

1070

IMAGE_DECLARATION(bias),

1071

#endif // defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1072

IMAGE_DECLARATION(dst),

1073

uint lhs_stride_z,

1074

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1075

#if defined(BETA)

1076

uint bias_stride_z,

1077

#endif //defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1078

uint dst_stride_z

1079

#if defined(REINTERPRET_INPUT_AS_3D)

1080

,

1081

uint lhs_cross_plane_pad

1082

#endif // REINTERPRET_INPUT_AS_3D

1083

#if defined(REINTERPRET_OUTPUT_AS_3D)

1084

,

1085

uint dst_cross_plane_pad

1086

#endif // REINTERPRET_OUTPUT_AS_3D

1087

)

1088

{

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1089

// Block size

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1090

#define RHS_BLOCK_SIZE ((K0) * (N0))

1091

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1092

// RHS offset and step X

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1093

#if defined(RHS_INTERLEAVE)

1094

#define RHS_OFFSET_X (K0)

1095

#define RHS_STEP_X ((K0) * (H0))

1096

#define RHS_STEP_LOOP (1)

1097

#else // defined(RHS_INTERLEAVE)

1098

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1099

#define RHS_STEP_X (K0)

1100

#define RHS_STEP_LOOP (H0)

1101

#endif // defined(RHS_INTERLEAVE)

1102

1103

uint x = get_global_id(0);

1104

uint y = get_global_id(1);

1105

uint z = get_global_id(2);

1106

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1107

#if defined(DUMMY_WORK_ITEMS)

1108

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1113

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1114

// Compute LHS matrix address

1115

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1116

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1117

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1118

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1119

1120

#if defined(MATRIX_B_DEPTH)

1121

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1122

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1123

#else // defined(MATRIX_B_DEPTH)

1124

rhs_offset += z * rhs_stride_z;

1125

#endif // defined(MATRIX_B_DEPTH)

1126

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1127

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1128

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1129

1130

#if defined(REINTERPRET_INPUT_AS_3D)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1131

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1132

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1133

1134

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1135

// multiply lhs_stride_z by DEPTH_GEMM3D

1136

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1137

1138

#else // defined(REINTERPRET_INPUT_AS_3D)

1139

1140

// Add offset for batched GEMM

1141

lhs_offset += z * lhs_stride_z;

1142

1143

#endif // defined(REINTERPRET_INPUT_AS_3D)

1144

1145

// Initialize the accumulators

1146

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1147

1148

int i = 0;

1149

for(; i <= (K - K0); i += K0)

1150

{

1151

// Supported cases (M0, K0):

1152

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1153

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1154

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1155

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1156

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1157

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1158

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1159

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1160

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1161

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1162

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1163

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1164

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1165

1166

// Accumulate

1167

ARM_DOT_K0XN0(K0, a0, b, c0);

1168

#if M0 > 1

1169

ARM_DOT_K0XN0(K0, a1, b, c1);

1170

#endif // M0 > 1

1171

#if M0 > 2

1172

ARM_DOT_K0XN0(K0, a2, b, c2);

1173

#endif // M0 > 2

1174

#if M0 > 3

1175

ARM_DOT_K0XN0(K0, a3, b, c3);

1176

#endif // M0 > 3

1177

#if M0 > 4

1178

ARM_DOT_K0XN0(K0, a4, b, c4);

1179

#endif // M0 > 4

1180

#if M0 > 5

1181

ARM_DOT_K0XN0(K0, a5, b, c5);

1182

#endif // M0 > 5

1183

#if M0 > 6

1184

ARM_DOT_K0XN0(K0, a6, b, c6);

1185

#endif // M0 > 6

1186

#if M0 > 7

1187

ARM_DOT_K0XN0(K0, a7, b, c7);

1188

#endif // M0 > 7

1189

1190

lhs_offset += K0 * sizeof(DATA_TYPE);

1191

rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

1192

}

1193

1194

// Left-over accumulations

1195

for(; i < K; ++i)

1196

{

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1197

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1198

LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1199

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1200

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1201

LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1202

1203

// Accumulate

1204

ARM_DOT_K0XN0(1, a0, b, c0);

1205

#if M0 > 1

1206

ARM_DOT_K0XN0(1, a1, b, c1);

1207

#endif // M0 > 1

1208

#if M0 > 2

1209

ARM_DOT_K0XN0(1, a2, b, c2);

1210

#endif // M0 > 2

1211

#if M0 > 3

1212

ARM_DOT_K0XN0(1, a3, b, c3);

1213

#endif // M0 > 3

1214

#if M0 > 4

1215

ARM_DOT_K0XN0(1, a4, b, c4);

1216

#endif // M0 > 4

1217

#if M0 > 5

1218

ARM_DOT_K0XN0(1, a5, b, c5);

1219

#endif // M0 > 5

1220

#if M0 > 6

1221

ARM_DOT_K0XN0(1, a6, b, c6);

1222

#endif // M0 > 6

1223

#if M0 > 7

1224

ARM_DOT_K0XN0(1, a7, b, c7);

1225

#endif // M0 > 7

1226

1227

lhs_offset += sizeof(DATA_TYPE);

1228

rhs_offset += sizeof(DATA_TYPE);

1229

}

1230

1231

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

1232

1233

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1234

1235

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1236

1237

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1238

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1239

1240

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1241

// multiply dst_stride_z by DEPTH_GEMM3D

1242

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1243

1244

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1245

1246

// Add offset for batched GEMM

1247

dst_addr += z * dst_stride_z;

1248

1249

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1250

1251

// Multiply by the weight of matrix-matrix product and store the result

1252

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1253

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1254

#endif // defined(ALPHA)

1255

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1256

// Add beta*bias

1257

#if defined(BETA)

1258

#if defined(BROADCAST_BIAS)

1259

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1260

1261

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1262

1263

#ifndef UNIT_BETA

1264

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1265

#endif // UNIT_BIAS

1266

1267

// c = c + bias[broadcasted]

1268

ADD_BLOCK_BROADCAST(M0, c, bias0);

1269

1270

#else // defined(BROADCAST_BIAS)

1271

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

1272

2) * bias_stride_z;

1273

1274

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1275

1276

#ifndef UNIT_BETA

1277

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

1282

1283

#endif // defined(BROADCAST_BIAS)

1284

#endif // defined(BETA)

1285

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1286

#if defined(ACTIVATION_TYPE)

1287

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

1288

#endif // defined(ACTIVATION_TYPE)

1289

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1290

// Store output block

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1291

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1292

1293

#undef RHS_BLOCK_SIZE

1294

#undef RHS_OFFSET_X

1295

#undef RHS_STEP_X

1296

}

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1297

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1298

#if defined(OPENCL_IMAGE_SUPPORT)

1299

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image

1300

* The LHS matrix is NOT reshaped

1301

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1302

*

1303

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

1304

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

1305

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1306

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

1307

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

1308

* could be different from the value returned by get_image_height(rhs_img).

1309

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1310

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1311

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

1312

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1313

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1314

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1315

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1316

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

1322

* The activation function is performed after the bias addition

1323

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1324

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1325

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1326

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1327

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1328

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1329

*

1330

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32

1331

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

1332

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

1333

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

1334

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

1335

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

1336

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

1337

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1338

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1339

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1340

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1341

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1342

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1343

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1344

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1345

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1346

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1347

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1348

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1349

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

1350

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1351

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

1352

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1353

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1354

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

1355

*/

1356

__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),

1357

__read_only image2d_t rhs_img,

1358

#if defined(BETA)

1359

IMAGE_DECLARATION(bias),

1360

#endif // defined(BETA)

1361

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

1367

uint dst_stride_z

1368

#if defined(REINTERPRET_INPUT_AS_3D)

1369

,

1370

uint lhs_cross_plane_pad

1371

#endif // REINTERPRET_INPUT_AS_3D

1372

#if defined(REINTERPRET_OUTPUT_AS_3D)

1373

,

1374

uint dst_cross_plane_pad

1375

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

1380

1381

#define LEFTOVER_K (K % K0)

1382

1383

// Block size

1384

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

1385

1386

// RHS offset and step X

1387

#if defined(RHS_INTERLEAVE)

1388

#define RHS_OFFSET_X (PIXEL_UNIT)

1389

#define RHS_STEP_X (PIXEL_UNIT * (H0))

1390

#define RHS_STEP_LOOP (1)

1391

#else // defined(RHS_INTERLEAVE)

1392

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1393

#define RHS_STEP_X PIXEL_UNIT

1394

#define RHS_STEP_LOOP (H0)

1395

#endif // defined(RHS_INTERLEAVE)

1396

1397

uint x = get_global_id(0);

1398

uint y = get_global_id(1);

1399

uint z = get_global_id(2);

1400

1401

#if defined(DUMMY_WORK_ITEMS)

1402

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1407

1408

// Compute LHS matrix address

1409

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1410

1411

#if defined(MATRIX_B_DEPTH)

1412

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1413

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

1414

#else // defined(MATRIX_B_DEPTH)

1415

const uint z_rhs = get_global_id(2);

1416

#endif // defined(MATRIX_B_DEPTH)

1417

1418

// Compute RHS matrix coordinates

1419

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

1420

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

1421

1422

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

1423

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

1424

1425

#if defined(REINTERPRET_INPUT_AS_3D)

1426

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1427

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

1428

1429

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1430

// multiply lhs_stride_z by DEPTH_GEMM3D

1431

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1432

1433

#else // defined(REINTERPRET_INPUT_AS_3D)

1434

1435

// Add offset for batched GEMM

1436

lhs_offset += z * lhs_stride_z;

1437

1438

#endif // defined(REINTERPRET_INPUT_AS_3D)

1439

1440

// Initialize the accumulators

1441

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

1442

1443

int i = 0;

1444

for(; i <= (K - K0); i += K0)

1445

{

1446

// Load values from LHS matrix

1447

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

1448

1449

// Load values from RHS matrix stored in a cl_image

1450

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

1451

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

1452

1453

// Accumulate

1454

ARM_DOT_K0XN0(K0, a0, b, c0);

1455

#if M0 > 1

1456

ARM_DOT_K0XN0(K0, a1, b, c1);

1457

#endif // M0 > 1

1458

#if M0 > 2

1459

ARM_DOT_K0XN0(K0, a2, b, c2);

1460

#endif // M0 > 2

1461

#if M0 > 3

1462

ARM_DOT_K0XN0(K0, a3, b, c3);

1463

#endif // M0 > 3

1464

#if M0 > 4

1465

ARM_DOT_K0XN0(K0, a4, b, c4);

1466

#endif // M0 > 4

1467

#if M0 > 5

1468

ARM_DOT_K0XN0(K0, a5, b, c5);

1469

#endif // M0 > 5

1470

#if M0 > 6

1471

ARM_DOT_K0XN0(K0, a6, b, c6);

1472

#endif // M0 > 6

1473

#if M0 > 7

1474

ARM_DOT_K0XN0(K0, a7, b, c7);

1475

#endif // M0 > 7

1476

1477

lhs_offset += K0 * sizeof(DATA_TYPE);

1478

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

}

#if LEFTOVER_K != 0

// Note: We cannot read out-of-bound elements from the RHS matrix because

1483

// the RHS width is always multiple of K0. This is not be true for the LHS matrix

union UNION_VEC_TYPE

{

DATA_TYPE s[K0];

VEC_DATA_TYPE(DATA_TYPE, K0)

v;

};

union UNION_VEC_TYPE a0 = {.v = 0 };

1493

#if M0 > 1

1494

union UNION_VEC_TYPE a1 = {.v = 0 };

1495

#endif // M0 > 1

1496

#if M0 > 2

1497

union UNION_VEC_TYPE a2 = {.v = 0 };

1498

#endif // M0 > 2

1499

#if M0 > 3

1500

union UNION_VEC_TYPE a3 = {.v = 0 };

1501

#endif // M0 > 3

1502

#if M0 > 4

1503

union UNION_VEC_TYPE a4 = {.v = 0 };

1504

#endif // M0 > 4

1505

#if M0 > 5

1506

union UNION_VEC_TYPE a5 = {.v = 0 };

1507

#endif // M0 > 5

1508

#if M0 > 6

1509

union UNION_VEC_TYPE a6 = {.v = 0 };

1510

#endif // M0 > 6

1511

#if M0 > 7

1512

union UNION_VEC_TYPE a7 = {.v = 0 };

1513

#endif // M0 > 7

1514

1515

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

1516

1517

// Load from RHS matrix

1518

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

1519

1520

// Load from LHS matrix

1521

for(int k = 0; k < LEFTOVER_K; ++k)

1522

{

1523

a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);

1524

#if M0 > 1

1525

a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);

1526

#endif // M0 > 1

1527

#if M0 > 2

1528

a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);

1529

#endif // M0 > 2

1530

#if M0 > 3

1531

a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);

1532

#endif // M0 > 3

1533

#if M0 > 4

1534

a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);

1535

#endif // M0 > 4

1536

#if M0 > 5

1537

a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);

1538

#endif // M0 > 5

1539

#if M0 > 6

1540

a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);

1541

#endif // M0 > 6

1542

#if M0 > 7

1543

a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);

1544

#endif // M0 > 7

1545

1546

lhs_offset += sizeof(DATA_TYPE);

}

// Accumulate

ARM_DOT_K0XN0(K0, a0.v, b, c0);

1551

#if M0 > 1

1552

ARM_DOT_K0XN0(K0, a1.v, b, c1);

1553

#endif // M0 > 1

1554

#if M0 > 2

1555

ARM_DOT_K0XN0(K0, a2.v, b, c2);

1556

#endif // M0 > 2

1557

#if M0 > 3

1558

ARM_DOT_K0XN0(K0, a3.v, b, c3);

1559

#endif // M0 > 3

1560

#if M0 > 4

1561

ARM_DOT_K0XN0(K0, a4.v, b, c4);

1562

#endif // M0 > 4

1563

#if M0 > 5

1564

ARM_DOT_K0XN0(K0, a5.v, b, c5);

1565

#endif // M0 > 5

1566

#if M0 > 6

1567

ARM_DOT_K0XN0(K0, a6.v, b, c6);

1568

#endif // M0 > 6

1569

#if M0 > 7

1570

ARM_DOT_K0XN0(K0, a7.v, b, c7);

1571

#endif // M0 > 7

1572

1573

#endif // LEFTOVER_K != 0

1574

1575

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

1576

1577

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1578

1579

#if defined(REINTERPRET_OUTPUT_AS_3D)

1580

1581

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1582

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

1583

1584

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1585

// multiply dst_stride_z by DEPTH_GEMM3D

1586

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1587

1588

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1589

1590

// Add offset for batched GEMM

1591

dst_addr += z * dst_stride_z;

1592

1593

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1594

1595

// Multiply by the weight of matrix-matrix product and store the result

1596

#if defined(ALPHA)

1597

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

1598

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

1603

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1604

1605

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1606

1607

#ifndef UNIT_BETA

1608

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1609

#endif // UNIT_BIAS

1610

1611

// c = c + bias[broadcasted]

1612

ADD_BLOCK_BROADCAST(M0, c, bias0);

1613

1614

#else // defined(BROADCAST_BIAS)

1615

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

1616

2) * bias_stride_z;

1617

1618

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1619

1620

#ifndef UNIT_BETA

1621

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

1626

1627

#endif // defined(BROADCAST_BIAS)

1628

#endif // defined(BETA)

1629

1630

#if defined(ACTIVATION_TYPE)

1631

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

1632

#endif // defined(ACTIVATION_TYPE)

1633

1634

// Store output block

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1635

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1636

1637

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef LEFTOVER_K

#undef PIXEL_UNIT

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

1644

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1645

/** Fused multiply-add accumulate: c = fma(a, b, c).
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator; must be an lvalue, it is read and then overwritten with the result
 *
 * @note @p c is expanded twice (once as the addend, once as the assignment target).
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/** Accumulate one K step for all M0 rows: for every row r in [0, M0)
 *  c##r = fma(broadcast((a##r).s##i), b, c##r).
 *
 * @param[in]      i Component suffix appended to ".s" to select one element of each a##r vector
 * @param[in]      a Basename of the LHS row variables (a##0 ... a##(M0-1))
 * @param[in]      b RHS vector, multiplied with the broadcast LHS element of every row
 * @param[in, out] c Basename of the accumulator variables (c##0 ... c##(M0-1))
 *
 * @note The LHS element is broadcast by casting it to VEC_DATA_TYPE(DATA_TYPE, N0).
 * @note Only M0 = 1, 2, 3, 4, 5, 6, 7, 8 is supported; any other value stops the build with an #error.
 */
#if M0 == 1
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

1721

1722

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1723

* The LHS matrix is NOT reshaped

1724

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

1725

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1726

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1727

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

1728

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1729

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1730

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1731

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1732

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1733

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1734

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1735

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1736

* - N0 = 2, 3, 4, 8, 16

1737

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1738

* - H0 >= 1

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1739

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1740

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1741

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1742

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1743

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1744

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1745

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1746

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1747

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1748

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1749

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1750

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1751

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1752

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1753

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1754

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1755

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1756

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1757

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1758

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1759

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1760

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1761

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1762

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1763

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1764

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1765

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1766

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1767

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1768

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1769

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1770

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1771

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1772

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1773

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1774

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1775

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1776

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1777

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1778

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1779

*/

1780

__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),

1781

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1782

#if defined(BETA)

1783

IMAGE_DECLARATION(bias),

1784

#endif // defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1785

IMAGE_DECLARATION(dst),

1786

uint lhs_stride_z,

1787

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1788

#if defined(BETA)

1789

uint bias_stride_z,

1790

#endif //defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1791

uint dst_stride_z

1792

#if defined(REINTERPRET_INPUT_AS_3D)

1793

,

1794

uint lhs_cross_plane_pad

1795

#endif // REINTERPRET_INPUT_AS_3D

1796

#if defined(REINTERPRET_OUTPUT_AS_3D)

1797

,

1798

uint dst_cross_plane_pad

1799

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

1804

1805

// RHS offset and step X

1806

#if defined(RHS_INTERLEAVE)

1807

#define RHS_OFFSET_X (N0)

1808

#define RHS_STEP_X ((N0) * (H0))

1809

#define RHS_STEP_LOOP (1)

1810

#else // defined(RHS_INTERLEAVE)

1811

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1812

#define RHS_STEP_X (N0)

1813

#define RHS_STEP_LOOP (H0)

1814

#endif // defined(RHS_INTERLEAVE)

1815

1816

uint x = get_global_id(0);

1817

uint y = get_global_id(1);

1818

uint z = get_global_id(2);

1819

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1820

#if defined(DUMMY_WORK_ITEMS)

1821

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1826

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1827

// Compute LHS matrix address

1828

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1829

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1830

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1831

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1832

1833

#if defined(MATRIX_B_DEPTH)

1834

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1835

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1836

#else // defined(MATRIX_B_DEPTH)

1837

rhs_offset += z * rhs_stride_z;

1838

#endif // defined(MATRIX_B_DEPTH)

1839

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1840

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;

1841

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1842

1843

#if defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1844

1845

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1846

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1847

1848

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1849

// multiply lhs_stride_z by DEPTH_GEMM3D

1850

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1851

1852

#else // defined(REINTERPRET_INPUT_AS_3D)

1853

1854

// Add offset for batched GEMM

1855

lhs_offset += z * lhs_stride_z;

1856

1857

#endif // defined(REINTERPRET_INPUT_AS_3D)

1858

1859

// Initialize the accumulators

1860

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;

1861

1862

int i = 0;

1863

for(; i <= (K - K0); i += K0)

1864

{

1865

// Supported cases (M0, K0):

1866

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1867

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1868

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1869

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1870

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1871

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1872

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1873

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1874

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1875

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1876

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1877

VEC_DATA_TYPE(DATA_TYPE, N0)

1878

b0;

1879

1880

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1881

VFMA_M0xN0(0, a, b0, c);

1882

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));

1883

VFMA_M0xN0(1, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1884

#if K0 > 2

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1885

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));

1886

VFMA_M0xN0(2, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1887

#endif // K0 > 2

1888

#if K0 > 3

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1889

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));

1890

VFMA_M0xN0(3, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1891

#endif // K0 > 3

1892

#if K0 > 4

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1893

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));

1894

VFMA_M0xN0(4, a, b0, c);

1895

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));

1896

VFMA_M0xN0(5, a, b0, c);

1897

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));

1898

VFMA_M0xN0(6, a, b0, c);

1899

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));

1900

VFMA_M0xN0(7, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1901

#endif // K0 > 4

1902

#if K0 > 8

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1903

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));

1904

VFMA_M0xN0(8, a, b0, c);

1905

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));

1906

VFMA_M0xN0(9, a, b0, c);

1907

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));

1908

VFMA_M0xN0(A, a, b0, c);

1909

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));

1910

VFMA_M0xN0(B, a, b0, c);

1911

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));

1912

VFMA_M0xN0(C, a, b0, c);

1913

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));

1914

VFMA_M0xN0(D, a, b0, c);

1915

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));

1916

VFMA_M0xN0(E, a, b0, c);

1917

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));

1918

VFMA_M0xN0(F, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1919

#endif // K0 > 8

1920

1921

lhs_offset += K0 * sizeof(DATA_TYPE);

1922

rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);

1923

}

1924

1925

// Left-over accumulations

1926

for(; i < K; ++i)

1927

{

1928

// Load values from LHS matrix

1929

VEC_DATA_TYPE(DATA_TYPE, 2)

1930

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

1931

#if M0 > 1

1932

VEC_DATA_TYPE(DATA_TYPE, 2)

1933

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

1934

#endif // M0 > 1

1935

#if M0 > 2

1936

VEC_DATA_TYPE(DATA_TYPE, 2)

1937

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

1938

#endif // M0 > 2

1939

#if M0 > 3

1940

VEC_DATA_TYPE(DATA_TYPE, 2)

1941

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

1942

#endif // M0 > 3

1943

#if M0 > 4

1944

VEC_DATA_TYPE(DATA_TYPE, 2)

1945

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

1946

#endif // M0 > 4

1947

#if M0 > 5

1948

VEC_DATA_TYPE(DATA_TYPE, 2)

1949

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

1950

#endif // M0 > 5

1951

#if M0 > 6

1952

VEC_DATA_TYPE(DATA_TYPE, 2)

1953

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

1954

#endif // M0 > 6

1955

#if M0 > 7

1956

VEC_DATA_TYPE(DATA_TYPE, 2)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

1957

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1958

#endif // M0 > 7

1959

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1960

VEC_DATA_TYPE(DATA_TYPE, N0)

1961

b0;

1962

1963

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1964

VFMA_M0xN0(0, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1965

1966

lhs_offset += sizeof(DATA_TYPE);

1967

rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);

1968

}

1969

1970

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

1971

1972

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1973

1974

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1975

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1976

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1977

1978

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1979

// multiply dst_stride_z by DEPTH_GEMM3D

1980

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1981

1982

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1983

1984

// Add offset for batched GEMM

1985

dst_addr += z * dst_stride_z;

1986

1987

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1988

1989

// Multiply by the weight of matrix-matrix product and store the result

1990

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1991

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1992

#endif // defined(ALPHA)

1993

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1994

// Add beta*bias

1995

#if defined(BETA)

1996

#if defined(BROADCAST_BIAS)

1997

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1998

1999

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2000

2001

#ifndef UNIT_BETA

2002

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2003

#endif // UNIT_BETA

2004

2005

// c = c + bias[broadcasted]

2006

ADD_BLOCK_BROADCAST(M0, c, bias0);

2007

2008

#else // defined(BROADCAST_BIAS)

2009

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2010

2) * bias_stride_z;

2011

2012

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2013

2014

#ifndef UNIT_BETA

2015

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

2020

2021

#endif // defined(BROADCAST_BIAS)

2022

#endif // defined(BETA)

2023

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2024

#if defined(ACTIVATION_TYPE)

2025

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2026

#endif // defined(ACTIVATION_TYPE)

2027

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2028

// Store output block

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

2029

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2030

2031

#undef RHS_BLOCK_SIZE

2032

#undef RHS_OFFSET_X

2033

#undef RHS_STEP_X

2034

}

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2035

2036

#if defined(OPENCL_IMAGE_SUPPORT)

2037

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2038

* The LHS matrix is NOT reshaped

2039

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

2040

*

2041

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2042

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2043

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

2044

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2045

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2046

* could be different from the value returned by get_image_height(rhs_img).

2047

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

2048

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

2049

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2050

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

2051

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

2052

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2053

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2054

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

2060

* The activation function is performed after the bias addition

2061

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2062

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

2063

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2064

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2065

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2066

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

2067

*

2068

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32

2069

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

2070

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2071

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

2072

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2073

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

2074

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2075

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2076

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2077

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2078

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2079

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2080

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2081

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

2082

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2083

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2084

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2085

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2086

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2087

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

2088

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2089

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2090

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2091

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

2092

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2093

*/

2094

__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),

2095

__read_only image2d_t rhs_img,

2096

#if defined(BETA)

2097

IMAGE_DECLARATION(bias),

2098

#endif // defined(BETA)

2099

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2105

uint dst_stride_z

2106

#if defined(REINTERPRET_INPUT_AS_3D)

2107

,

2108

uint lhs_cross_plane_pad

2109

#endif // REINTERPRET_INPUT_AS_3D

2110

#if defined(REINTERPRET_OUTPUT_AS_3D)

2111

,

2112

uint dst_cross_plane_pad

2113

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

2118

2119

// Block size

2120

#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

2121

2122

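// RHS offset and step X
// NOTE(review): RHS_STEP_LOOP, used below when advancing x_rhs, is not defined in this kernel; it is
// inherited from gemm_mm_reshaped_only_rhs_nt, which defines it and does not #undef it. Consider
// defining (and undefining) it here so this kernel does not depend on the earlier definition.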

2123

#if defined(RHS_INTERLEAVE)

2124

#define RHS_OFFSET_X (PIXEL_UNIT)

2125

#define RHS_STEP_X ((PIXEL_UNIT) * (H0))

2126

#else // defined(RHS_INTERLEAVE)

2127

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2128

#define RHS_STEP_X (PIXEL_UNIT)

2129

#endif // defined(RHS_INTERLEAVE)

2130

2131

uint x = get_global_id(0);

2132

uint y = get_global_id(1);

2133

uint z = get_global_id(2);

2134

2135

#if defined(DUMMY_WORK_ITEMS)

2136

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2141

2142

// Compute LHS matrix address

2143

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

2144

2145

#if defined(MATRIX_B_DEPTH)

2146

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2147

const uint z_rhs = (z % MATRIX_B_DEPTH);

2148

#else // defined(MATRIX_B_DEPTH)

2149

const uint z_rhs = z;

2150

#endif // defined(MATRIX_B_DEPTH)

2151

2152

// Compute RHS matrix coordinates

2153

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

2154

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

2155

2156

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);

2157

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2158

2159

#if defined(REINTERPRET_INPUT_AS_3D)

2160

2161

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2162

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

2163

2164

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2165

// multiply lhs_stride_z by DEPTH_GEMM3D

2166

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

2167

2168

#else // defined(REINTERPRET_INPUT_AS_3D)

2169

2170

// Add offset for batched GEMM

2171

lhs_offset += z * lhs_stride_z;

2172

2173

#endif // defined(REINTERPRET_INPUT_AS_3D)

2174

2175

// Initialize the accumulators

2176

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

2177

2178

int i = 0;

2179

for(; i <= (K - K0); i += K0)

2180

{

2181

// Load values from LHS matrix

2182

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

2183

2184

VEC_DATA_TYPE(DATA_TYPE, N0)

2185

b0;

2186

2187

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2188

VFMA_M0xN0(0, a, b0, c);

2189

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

2190

VFMA_M0xN0(1, a, b0, c);

2191

#if K0 > 2

2192

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

2193

VFMA_M0xN0(2, a, b0, c);

2194

#endif // K0 > 2

2195

#if K0 > 3

2196

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

2197

VFMA_M0xN0(3, a, b0, c);

2198

#endif // K0 > 3

2199

#if K0 > 4

2200

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

2201

VFMA_M0xN0(4, a, b0, c);

2202

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

2203

VFMA_M0xN0(5, a, b0, c);

2204

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

2205

VFMA_M0xN0(6, a, b0, c);

2206

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

2207

VFMA_M0xN0(7, a, b0, c);

2208

#endif // K0 > 4

2209

#if K0 > 8

2210

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

2211

VFMA_M0xN0(8, a, b0, c);

2212

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

2213

VFMA_M0xN0(9, a, b0, c);

2214

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

2215

VFMA_M0xN0(A, a, b0, c);

2216

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

2217

VFMA_M0xN0(B, a, b0, c);

2218

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

2219

VFMA_M0xN0(C, a, b0, c);

2220

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

2221

VFMA_M0xN0(D, a, b0, c);

2222

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

2223

VFMA_M0xN0(E, a, b0, c);

2224

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

2225

VFMA_M0xN0(F, a, b0, c);

2226

#endif // K0 > 8

2227

2228

lhs_offset += K0 * sizeof(DATA_TYPE);

2229

x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;

2230

}

2231

2232

// Left-over accumulations

2233

for(; i < K; ++i)

2234

{

2235

// Load values from LHS matrix

2236

VEC_DATA_TYPE(DATA_TYPE, 2)

2237

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

2238

#if M0 > 1

2239

VEC_DATA_TYPE(DATA_TYPE, 2)

2240

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

2241

#endif // M0 > 1

2242

#if M0 > 2

2243

VEC_DATA_TYPE(DATA_TYPE, 2)

2244

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

2245

#endif // M0 > 2

2246

#if M0 > 3

2247

VEC_DATA_TYPE(DATA_TYPE, 2)

2248

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

2249

#endif // M0 > 3

2250

#if M0 > 4

2251

VEC_DATA_TYPE(DATA_TYPE, 2)

2252

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

2253

#endif // M0 > 4

2254

#if M0 > 5

2255

VEC_DATA_TYPE(DATA_TYPE, 2)

2256

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

2257

#endif // M0 > 5

2258

#if M0 > 6

2259

VEC_DATA_TYPE(DATA_TYPE, 2)

2260

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

2261

#endif // M0 > 6

2262

#if M0 > 7

2263

VEC_DATA_TYPE(DATA_TYPE, 2)

2264

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

2265

#endif // M0 > 7

2266

2267

VEC_DATA_TYPE(DATA_TYPE, N0)

2268

b0;

2269

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2270

2271

VFMA_M0xN0(0, a, b0, c);

2272

2273

lhs_offset += sizeof(DATA_TYPE);

x_rhs += RHS_STEP_X;

}

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

2278

2279

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

2280

2281

#if defined(REINTERPRET_OUTPUT_AS_3D)

2282

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2283

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2284

2285

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2286

// multiply dst_stride_z by DEPTH_GEMM3D

2287

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2288

2289

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2290

2291

// Add offset for batched GEMM

2292

dst_addr += z * dst_stride_z;

2293

2294

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2295

2296

// Multiply by the weight of matrix-matrix product and store the result

2297

#if defined(ALPHA)

2298

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2299

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2304

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2305

2306

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2307

2308

#ifndef UNIT_BETA

2309

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2310

#endif // UNIT_BETA

2311

2312

// c = c + bias[broadcasted]

2313

ADD_BLOCK_BROADCAST(M0, c, bias0);

2314

2315

#else // defined(BROADCAST_BIAS)

2316

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2317

2) * bias_stride_z;

2318

2319

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2320

2321

#ifndef UNIT_BETA

2322

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

2327

2328

#endif // defined(BROADCAST_BIAS)

2329

#endif // defined(BETA)

2330

2331

#if defined(ACTIVATION_TYPE)

2332

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2333

#endif // defined(ACTIVATION_TYPE)

2334

2335

// Store output block

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

2336

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2337

2338

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2343

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2344

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2345

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2346

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2347

#if defined(MIXED_PRECISION)

2348

#if K0 == 2

2349

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

})

#elif K0 == 3 // K0 == 3

2355

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

})

#elif K0 == 4 // K0 == 4

2362

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

c += a.s3 * b.s3; \

})

#elif K0 == 8 // K0 == 8

2370

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

c += a.s3 * b.s3; \

c += a.s4 * b.s4; \

c += a.s5 * b.s5; \

c += a.s6 * b.s6; \

c += a.s7 * b.s7; \

})

#elif K0 == 16 // K0 == 16

2382

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

c += a.s3 * b.s3; \

c += a.s4 * b.s4; \

c += a.s5 * b.s5; \

c += a.s6 * b.s6; \

c += a.s7 * b.s7; \

c += a.s8 * b.s8; \

c += a.s9 * b.s9; \

c += a.sA * b.sA; \

c += a.sB * b.sB; \

c += a.sC * b.sC; \

c += a.sD * b.sD; \

c += a.sE * b.sE; \

c += a.sF * b.sF; \

})

#else // K0 not supported

2402

#error "K0 value not supported"

2403

#endif // K0 conditions

2404

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2405

/** One accumulation step of the K0-wide dot product.
 *
 * Folds component `lane` of @p a and @p b into the accumulator @p c with a single fma:
 * c = fma(a.s<lane>, b.s<lane>, c). The lane suffix is token-pasted, so it must be a
 * literal component index (0-9, A-F).
 */
#define ARM_DOT_K0_LANE(a, b, c, lane) c = fma(a.s##lane, b.s##lane, c)

/* ARM_DOT_K0(a, b, c): accumulates the dot product of the first K0 components of a and b
 * into c, one fma per component, in ascending component order. The variant is selected by
 * the compile-time value of K0; any other value is rejected at preprocessing time.
 */
#if K0 == 2
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANE(a, b, c, 0); \
        ARM_DOT_K0_LANE(a, b, c, 1); \
    })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANE(a, b, c, 0); \
        ARM_DOT_K0_LANE(a, b, c, 1); \
        ARM_DOT_K0_LANE(a, b, c, 2); \
    })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANE(a, b, c, 0); \
        ARM_DOT_K0_LANE(a, b, c, 1); \
        ARM_DOT_K0_LANE(a, b, c, 2); \
        ARM_DOT_K0_LANE(a, b, c, 3); \
    })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANE(a, b, c, 0); \
        ARM_DOT_K0_LANE(a, b, c, 1); \
        ARM_DOT_K0_LANE(a, b, c, 2); \
        ARM_DOT_K0_LANE(a, b, c, 3); \
        ARM_DOT_K0_LANE(a, b, c, 4); \
        ARM_DOT_K0_LANE(a, b, c, 5); \
        ARM_DOT_K0_LANE(a, b, c, 6); \
        ARM_DOT_K0_LANE(a, b, c, 7); \
    })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANE(a, b, c, 0); \
        ARM_DOT_K0_LANE(a, b, c, 1); \
        ARM_DOT_K0_LANE(a, b, c, 2); \
        ARM_DOT_K0_LANE(a, b, c, 3); \
        ARM_DOT_K0_LANE(a, b, c, 4); \
        ARM_DOT_K0_LANE(a, b, c, 5); \
        ARM_DOT_K0_LANE(a, b, c, 6); \
        ARM_DOT_K0_LANE(a, b, c, 7); \
        ARM_DOT_K0_LANE(a, b, c, 8); \
        ARM_DOT_K0_LANE(a, b, c, 9); \
        ARM_DOT_K0_LANE(a, b, c, A); \
        ARM_DOT_K0_LANE(a, b, c, B); \
        ARM_DOT_K0_LANE(a, b, c, C); \
        ARM_DOT_K0_LANE(a, b, c, D); \
        ARM_DOT_K0_LANE(a, b, c, E); \
        ARM_DOT_K0_LANE(a, b, c, F); \
    })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2461

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2462

2463

/** One column of the K0xN0 product.
 *
 * Expands to ARM_DOT_K0 applied to the row vector @p a, the variable named <b><col>
 * (basename @p b with the column suffix pasted on) and component `col` of the accumulator
 * vector @p c. The column suffix is token-pasted, so it must be a literal index (0-9, A-F).
 */
#define ARM_DOT_K0XN0_COL(a, b, c, col) ARM_DOT_K0((a), (b##col), (c.s##col))

/* ARM_DOT_K0XN0(a, b, c): for every column j in [0, N0), accumulates the K0-wide dot
 * product of a with the variable b<j> into c.s<j>, in ascending column order.
 * @p b is a variable basename, not a value. The variant is selected by the compile-time
 * value of N0; any other value is rejected at preprocessing time.
 */
#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
        ARM_DOT_K0XN0_COL(a, b, c, 3); \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
        ARM_DOT_K0XN0_COL(a, b, c, 3); \
        ARM_DOT_K0XN0_COL(a, b, c, 4); \
        ARM_DOT_K0XN0_COL(a, b, c, 5); \
        ARM_DOT_K0XN0_COL(a, b, c, 6); \
        ARM_DOT_K0XN0_COL(a, b, c, 7); \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
        ARM_DOT_K0XN0_COL(a, b, c, 3); \
        ARM_DOT_K0XN0_COL(a, b, c, 4); \
        ARM_DOT_K0XN0_COL(a, b, c, 5); \
        ARM_DOT_K0XN0_COL(a, b, c, 6); \
        ARM_DOT_K0XN0_COL(a, b, c, 7); \
        ARM_DOT_K0XN0_COL(a, b, c, 8); \
        ARM_DOT_K0XN0_COL(a, b, c, 9); \
        ARM_DOT_K0XN0_COL(a, b, c, A); \
        ARM_DOT_K0XN0_COL(a, b, c, B); \
        ARM_DOT_K0XN0_COL(a, b, c, C); \
        ARM_DOT_K0XN0_COL(a, b, c, D); \
        ARM_DOT_K0XN0_COL(a, b, c, E); \
        ARM_DOT_K0XN0_COL(a, b, c, F); \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

2519

2520

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2521

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2522

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2523

*

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2524

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2525

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2526

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2527

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2528

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2529

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2530

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2531

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2532

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.

2533

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.

2534

* @note Only the following configurations of M0, N0 and K0 are currently supported:

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2535

* - M0 = 2, 3, 4, 5, 6, 7, 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2536

* - N0 = 2, 3, 4, 8, 16

2537

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

2538

* - V0 >= 1

2539

* - H0 >= 1

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2540

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2541

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2542

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2543

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2544

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2545

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2546

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2547

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2548

*

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2549

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

2550

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2551

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2552

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2553

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2554

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2555

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

2556

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

2557

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2558

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

2559

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2560

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

2561

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2562

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2563

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2564

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2565

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2566

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2567

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2568

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2569

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2570

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2571

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2572

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2573

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2574

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2575

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2576

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2577

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2578

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2579

*/

2580

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

2581

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2582

#if defined(BETA)

2583

IMAGE_DECLARATION(bias),

2584

#endif // defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2585

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2586

uint k,

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2587

uint lhs_stride_z,

2588

uint rhs_stride_z,

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2589

#if defined(BETA)

2590

uint bias_stride_z,

2591

#endif //defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2592

uint dst_stride_z

2593

#if defined(REINTERPRET_OUTPUT_AS_3D)

2594

,

2595

uint dst_cross_plane_pad

2596

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

2601

2602

#if defined(LHS_INTERLEAVE)

2603

#define LHS_OFFSET_X (K0)

2604

#define LHS_STEP_X ((K0) * (V0))

2605

#define LHS_STEP_LOOP (1)

2606

#else // defined(INTERLEAVE)

2607

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2608

#define LHS_STEP_X (K0)

2609

#define LHS_STEP_LOOP (V0)

2610

#endif // defined(INTERLEAVE)

2611

2612

// Block size

2613

#define RHS_BLOCK_SIZE ((K0) * (N0))

2614

2615

// RHS offset and step X

2616

#if defined(RHS_INTERLEAVE)

2617

#define RHS_OFFSET_X (K0)

2618

#define RHS_STEP_X ((K0) * (H0))

2619

#define RHS_STEP_LOOP (1)

2620

#else // defined(RHS_INTERLEAVE)

2621

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2622

#define RHS_STEP_X (K0)

2623

#define RHS_STEP_LOOP (H0)

2624

#endif // defined(RHS_INTERLEAVE)

2625

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2626

#if defined(DUMMY_WORK_ITEMS)

2627

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2632

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2633

// Compute LHS matrix address

2634

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2635

(get_global_id(2) * lhs_stride_z);

2636

2637

// Compute RHS matrix address

2638

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

2639

2640

#if defined(MATRIX_B_DEPTH)

2641

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2642

rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;

2643

#else // defined(MATRIX_B_DEPTH)

2644

rhs_addr += get_global_id(2) * rhs_stride_z;

2645

#endif // defined(MATRIX_B_DEPTH)

2646

2647

// Initialize the accumulators

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2648

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2649

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2650

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2651

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2652

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2653

for(int i = 0; i < k; i += K0)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2654

{

2655

// Supported cases (M0, K0):

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2656

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

2657

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

2658

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

2659

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

2660

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

2661

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

2662

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

2663

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2664

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2665

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2666

2667

// Load values from RHS matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2668

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2669

2670

// Accumulate

2671

ARM_DOT_K0XN0(a0, b, c0);

2672

#if M0 > 1

2673

ARM_DOT_K0XN0(a1, b, c1);

2674

#endif // M0 > 1

2675

#if M0 > 2

2676

ARM_DOT_K0XN0(a2, b, c2);

2677

#endif // M0 > 2

2678

#if M0 > 3

2679

ARM_DOT_K0XN0(a3, b, c3);

2680

#endif // M0 > 3

2681

#if M0 > 4

2682

ARM_DOT_K0XN0(a4, b, c4);

2683

#endif // M0 > 4

2684

#if M0 > 5

2685

ARM_DOT_K0XN0(a5, b, c5);

2686

#endif // M0 > 5

2687

#if M0 > 6

2688

ARM_DOT_K0XN0(a6, b, c6);

2689

#endif // M0 > 6

2690

#if M0 > 7

2691

ARM_DOT_K0XN0(a7, b, c7);

2692

#endif // M0 > 7

2693

2694

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2695

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

2696

}

2697

2698

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2699

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2700

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2701

2702

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2703

2704

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2705

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2706

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2707

// multiply dst_stride_z by DEPTH_GEMM3D

2708

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2709

2710

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2711

2712

// Add offset for batched GEMM

2713

dst_addr += get_global_id(2) * dst_stride_z;

2714

2715

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2716

2717

// Multiply by the weight of matrix-matrix product and store the result

2718

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2719

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2720

#endif // defined(ALPHA)

2721

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2722

// Add beta*bias

2723

#if defined(BETA)

2724

#if defined(BROADCAST_BIAS)

2725

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2726

2727

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2728

2729

#ifndef UNIT_BETA

2730

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2731

#endif // UNIT_BIAS

2732

2733

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2734

#if defined(MIXED_PRECISION)

2735

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2736

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2737

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2738

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2739

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2740

2741

#else // defined(BROADCAST_BIAS)

2742

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2743

2) * bias_stride_z;

2744

2745

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2746

2747

#ifndef UNIT_BETA

2748

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

2749

#endif // UNIT_BIAS

2750

2751

// c = c + bias

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2752

#if defined(MIXED_PRECISION)

2753

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2754

ADD_BLOCK(M0, c, bias_hp);

2755

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2756

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2757

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2758

2759

#endif // defined(BROADCAST_BIAS)

2760

#endif // defined(BETA)

2761

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2762

#if defined(ACTIVATION_TYPE)

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2763

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2764

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2765

#else // defined(MIXED_PRECISION)

2766

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2767

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2768

#endif // defined(ACTIVATION_TYPE)

2769

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2770

// Store output block

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2771

#if defined(MIXED_PRECISION)

2772

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

2773

#else // defined(MIXED_PRECISION)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2774

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2775

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2776

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2777

#undef LHS_BLOCK_SIZE

2778

#undef LHS_OFFSET_X

2779

#undef LHS_STEP_X

2780

#undef RHS_BLOCK_SIZE

2781

#undef RHS_OFFSET_X

2782

#undef RHS_STEP_X

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2783

#undef LHS_STEP_LOOP

2784

#undef RHS_STEP_LOOP

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2785

}

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2786

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2787

#if defined(OPENCL_IMAGE_SUPPORT)

2788

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

2789

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2790

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2791

*

2792

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2793

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2794

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2795

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

2796

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2797

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2798

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2799

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2800

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2801

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2802

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2803

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2804

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2805

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

2806

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2807

* - M0 = 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

2814

* The activation function is performed after the bias addition

2815

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

2816

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2817

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2818

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2819

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2820

*

2821

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

2822

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2823

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2824

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2825

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2826

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2827

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2828

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2829

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2830

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2831

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2832

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2833

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2834

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2835

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2836

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2837

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2838

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2839

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2840

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2841

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2842

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2843

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2844

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2845

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2846

*/

2847

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),

2848

__read_only image2d_t rhs_img,

2849

#if defined(BETA)

2850

IMAGE_DECLARATION(bias),

2851

#endif // defined(BETA)

2852

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2853

uint k,

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2859

uint dst_stride_z

2860

#if defined(REINTERPRET_OUTPUT_AS_3D)

2861

,

2862

uint dst_cross_plane_pad

2863

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

2868

2869

// Block size

2870

#define LHS_BLOCK_SIZE ((K0) * (M0))

2871

2872

#if defined(LHS_INTERLEAVE)

2873

#define LHS_OFFSET_X (K0)

2874

#define LHS_STEP_X ((K0) * (V0))

2875

#define LHS_STEP_LOOP (1)

2876

#else // defined(INTERLEAVE)

2877

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2878

#define LHS_STEP_X (K0)

2879

#define LHS_STEP_LOOP (V0)

2880

#endif // defined(INTERLEAVE)

2881

2882

// Block size

2883

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

2884

2885

// RHS offset and step X

2886

#if defined(RHS_INTERLEAVE)

2887

#define RHS_OFFSET_X (PIXEL_UNIT)

2888

#define RHS_STEP_X (PIXEL_UNIT * (H0))

2889

#define RHS_STEP_LOOP (1)

2890

#else // defined(RHS_INTERLEAVE)

2891

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2892

#define RHS_STEP_X PIXEL_UNIT

2893

#define RHS_STEP_LOOP (H0)

2894

#endif // defined(RHS_INTERLEAVE)

2895

2896

#if defined(DUMMY_WORK_ITEMS)

2897

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2902

2903

// Compute LHS matrix address

2904

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2905

(get_global_id(2) * lhs_stride_z);

2906

2907

#if defined(MATRIX_B_DEPTH)

2908

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2909

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

2910

#else // defined(MATRIX_B_DEPTH)

2911

const uint z_rhs = get_global_id(2);

2912

#endif // defined(MATRIX_B_DEPTH)

2913

2914

// Compute RHS matrix coordinates

2915

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

2916

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

2917

2918

// Initialize the accumulators

2919

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

2920

2921

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2922

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2923

2924

for(int i = 0; i < K; i += K0)

2925

{

2926

// Load values from LHS matrix

2927

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

2928

2929

// Load values from RHS matrix stored in a cl_image

2930

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

2931

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

2932

2933

// Accumulate

2934

ARM_DOT_K0XN0(a0, b, c0);

2935

#if M0 > 1

2936

ARM_DOT_K0XN0(a1, b, c1);

2937

#endif // M0 > 1

2938

#if M0 > 2

2939

ARM_DOT_K0XN0(a2, b, c2);

2940

#endif // M0 > 2

2941

#if M0 > 3

2942

ARM_DOT_K0XN0(a3, b, c3);

2943

#endif // M0 > 3

2944

#if M0 > 4

2945

ARM_DOT_K0XN0(a4, b, c4);

2946

#endif // M0 > 4

2947

#if M0 > 5

2948

ARM_DOT_K0XN0(a5, b, c5);

2949

#endif // M0 > 5

2950

#if M0 > 6

2951

ARM_DOT_K0XN0(a6, b, c6);

2952

#endif // M0 > 6

2953

#if M0 > 7

2954

ARM_DOT_K0XN0(a7, b, c7);

2955

#endif // M0 > 7

2956

2957

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2958

2959

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

2960

}

2961

2962

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2963

2964

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

2965

2966

#if defined(REINTERPRET_OUTPUT_AS_3D)

2967

2968

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2969

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2970

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2971

// multiply dst_stride_z by DEPTH_GEMM3D

2972

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2973

2974

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2975

2976

// Add offset for batched GEMM

2977

dst_addr += get_global_id(2) * dst_stride_z;

2978

2979

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2980

2981

// Multiply by the weight of matrix-matrix product and store the result

2982

#if defined(ALPHA)

2983

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2984

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2989

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2990

2991

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2992

2993

#ifndef UNIT_BETA

2994

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2995

#endif // UNIT_BIAS

2996

2997

// c = c + bias[broadcasted]

2998

#if defined(MIXED_PRECISION)

2999

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3000

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

3001

#else // defined(MIXED_PRECISION)

3002

ADD_BLOCK_BROADCAST(M0, c, bias0);

3003

#endif // defined(MIXED_PRECISION)

3004

3005

#else // defined(BROADCAST_BIAS)

3006

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

3007

2) * bias_stride_z;

3008

3009

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3010

3011

#ifndef UNIT_BETA

3012

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

#if defined(MIXED_PRECISION)

3017

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3018

ADD_BLOCK(M0, c, bias_hp);

3019

#else // defined(MIXED_PRECISION)

3020

ADD_BLOCK(M0, c, bias);

3021

#endif // defined(MIXED_PRECISION)

3022

3023

#endif // defined(BROADCAST_BIAS)

3024

#endif // defined(BETA)

3025

3026

#if defined(ACTIVATION_TYPE)

3027

#if defined(MIXED_PRECISION)

3028

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

3029

#else // defined(MIXED_PRECISION)

3030

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

3031

#endif // defined(MIXED_PRECISION)

3032

#endif // defined(ACTIVATION_TYPE)

3033

3034

// Store output block

3035

#if defined(MIXED_PRECISION)

3036

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3037

#else // defined(MIXED_PRECISION)

3038

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3039

#endif // defined(MIXED_PRECISION)

3040

3041

#undef LHS_BLOCK_SIZE

3042

#undef LHS_OFFSET_X

3043

#undef LHS_STEP_X

3044

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3052

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3053

#if defined(LHS_TRANSPOSE)

3054

3055

// Shorthand for VEC_DATA_TYPE(TYPE, SIZE); used by the ARM_VVM_T_NT_* macros to build the vector type a scalar is broadcast into
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

3056

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3057

// Vector multiply-accumulate: c = c + a * b
// - N0 is the vector size (only used when MIXED_PRECISION is defined, to build the accumulator vector type)
// - With MIXED_PRECISION, a and b are converted to VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0) before the multiplication
// - When GPU_ARCH == GPU_ARCH_MIDGARD a separate multiply and add is emitted, otherwise fma() is used
// Note: the expansion already ends with ';'
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#else // defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#endif // defined(MIXED_PRECISION)


// Column-vector (1 element, transposed) by row-vector (N0 elements, not transposed).
// The scalar a is broadcast to a VTYPE(TYPE, N0) vector and a * b is accumulated into C0 through ARM_VFMA
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
    ({                                                 \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
    })

// Column-vector (2 elements, transposed) by row-vector (N0 elements, not transposed).
// Each element a.sX is broadcast to a VTYPE(TYPE, N0) vector and a.sX * b is accumulated into row CX through ARM_VFMA
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })

// Column-vector (3 elements, transposed) by row-vector (N0 elements, not transposed).
// Rows C0-C1 are updated by the 2-element variant, then a.s2 * b is accumulated into C2
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })

// Column-vector (4 elements, transposed) by row-vector (N0 elements, not transposed).
// Rows C0-C2 are updated by the 3-element variant, then a.s3 * b is accumulated into C3
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })

// Column-vector (8 elements, transposed) by row-vector (N0 elements, not transposed).
// Rows C0-C3 are updated by the 4-element variant, then a.s4..a.s7 times b are accumulated into C4..C7
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })


// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
// a is the column-vector (transposed)
// b is the row-vector (not transposed)
// C is the output matrix
// Lower case is a vector (a, b)
// Upper case is a matrix (C)
// M0 is pasted into the macro name, so it must already be a literal (1, 2, 3, 4 or 8) when this macro is expanded
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)


// Matrix (transposed) by matrix (not transposed) multiplication with K0 = 1: C += A0 (column) * B0 (row)
#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
    })

// Matrix (transposed) by matrix (not transposed) multiplication with K0 = 2: K0 = 1 case plus the A1 * B1 contribution
#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
    })

// Matrix (transposed) by matrix (not transposed) multiplication with K0 = 3: K0 = 2 case plus the A2 * B2 contribution
#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
    })

// Matrix (transposed) by matrix (not transposed) multiplication with K0 = 4: K0 = 3 case plus the A3 * B3 contribution
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
    })

// Matrix (transposed) by matrix (not transposed) multiplication with K0 = 8: K0 = 4 case plus the A4..A7 * B4..B7 contributions
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
    })

// Matrix (transposed) by matrix (not transposed) multiplication with K0 = 16: K0 = 8 case plus the A8..AF * B8..BF contributions
// The extra rows must go through ARM_VVM_T_NT_M0xN0x1 (vector-by-vector): ARM_MM_T_NT_M0xN0x1 would paste a further "0"
// onto the already parenthesised operand (A##8), which is not a valid token concatenation
#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C)            \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
    })


// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
// The dimensions for this matrix multiplications are defined through M0, N0 and K0
// The dimensions supported are:
// M0: 1, 2, 3, 4, 8
// N0: 1, 2, 3, 4, 8, 16
// K0: 1, 2, 3, 4, 8, 16
// This macro calls the vector-by-matrix macro K0 times
// A, B and C are matrices
// K0 is concatenated onto the ARM_MM_T_NT_M0xN0x prefix to select the implementation, which is then invoked with the remaining arguments
#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
    (M0, N0, TYPE, A, B, C)


/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

3164

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

3165

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

3166

*

3167

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

3168

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3169

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3170

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

3171

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

3172

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

3173

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.

3174

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.

3175

* @note Only the following configurations of M0, N0 and K0 are currently supported:

3176

* - M0 = 2, 3, 4, 8

3177

* - N0 = 2, 3, 4, 8, 16

3178

* - K0 = 2, 3, 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

3183

* The activation function is performed after the bias addition

3184

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

3185

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3186

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3187

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3188

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

3189

*

3190

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

3191

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

3192

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3193

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

3194

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

3195

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

3196

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

3197

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

3198

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3199

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

3200

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

3201

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

3202

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

3203

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3204

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

3205

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3206

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

3207

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

3208

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

3209

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3210

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3211

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3212

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3213

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3214

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3215

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

3216

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

3217

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

3218

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3219

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3220

*/

3221

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),

3222

IMAGE_DECLARATION(rhs),

3223

#if defined(BETA)

3224

IMAGE_DECLARATION(bias),

3225

#endif // defined(BETA)

3226

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3227

uint k,

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

3233

uint dst_stride_z

3234

#if defined(REINTERPRET_OUTPUT_AS_3D)

3235

,

3236

uint dst_cross_plane_pad

3237

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

3242

3243

#if defined(LHS_INTERLEAVE)

3244

#define LHS_OFFSET_X (M0)

3245

#define LHS_STEP_X ((M0) * (V0))

3246

#define LHS_STEP_LOOP (1)

3247

#else // defined(INTERLEAVE)

3248

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

3249

#define LHS_STEP_X (M0)

3250

#define LHS_STEP_LOOP (V0)

3251

#endif // defined(INTERLEAVE)

3252

3253

// Block size

3254

#define RHS_BLOCK_SIZE ((K0) * (N0))

3255

3256

// RHS offset and step X

3257

#if defined(RHS_INTERLEAVE)

3258

#define RHS_OFFSET_X (N0)

3259

#define RHS_STEP_X ((N0) * (H0))

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3260

#else // defined(RHS_INTERLEAVE)

3261

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

3262

#define RHS_STEP_X (N0)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3263

#endif // defined(RHS_INTERLEAVE)

3264

3265

const uint x = get_global_id(0);

3266

const uint y = get_global_id(1);

3267

const uint z = get_global_id(2);

3268

3269

#if defined(DUMMY_WORK_ITEMS)

3270

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

3275

3276

// Compute LHS matrix address

3277

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

3278

3279

// Compute RHS matrix address

3280

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

3281

3282

#if defined(MATRIX_B_DEPTH)

3283

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3284

rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;

3285

#else // defined(MATRIX_B_DEPTH)

3286

rhs_addr += z * rhs_stride_z;

3287

#endif // defined(MATRIX_B_DEPTH)

3288

3289

// Initialize the accumulators

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3290

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3291

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3292

REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

3293

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3294

__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

3295

__global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);

3296

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3297

for(int i = 0; i < k; i += K0)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3298

{

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3299

VEC_DATA_TYPE(DATA_TYPE, M0)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3300

a0;

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3301

VEC_DATA_TYPE(DATA_TYPE, N0)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3302

b0;

3303

3304

a0 = VLOAD(M0)(0, lhs);

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3305

b0 = VLOAD(N0)(0, rhs);

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3306

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3307

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3308

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3309

lhs += LHS_STEP_X;

3310

rhs += RHS_STEP_X;

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3311

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3312

#if K0 > 1

3313

a0 = VLOAD(M0)(0, lhs);

3314

b0 = VLOAD(N0)(0, rhs);

3315

3316

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 1

#if K0 > 2

a0 = VLOAD(M0)(0, lhs);

3324

b0 = VLOAD(N0)(0, rhs);

3325

3326

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 2

#if K0 > 3

a0 = VLOAD(M0)(0, lhs);

3334

b0 = VLOAD(N0)(0, rhs);

3335

3336

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 3

#if K0 > 4

a0 = VLOAD(M0)(0, lhs);

3344

b0 = VLOAD(N0)(0, rhs);

3345

3346

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3352

b0 = VLOAD(N0)(0, rhs);

3353

3354

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3360

b0 = VLOAD(N0)(0, rhs);

3361

3362

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3368

b0 = VLOAD(N0)(0, rhs);

3369

3370

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 4

#if K0 > 8

a0 = VLOAD(M0)(0, lhs);

3378

b0 = VLOAD(N0)(0, rhs);

3379

3380

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3386

b0 = VLOAD(N0)(0, rhs);

3387

3388

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3394

b0 = VLOAD(N0)(0, rhs);

3395

3396

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3402

b0 = VLOAD(N0)(0, rhs);

3403

3404

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3410

b0 = VLOAD(N0)(0, rhs);

3411

3412

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3418

b0 = VLOAD(N0)(0, rhs);

3419

3420

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3426

b0 = VLOAD(N0)(0, rhs);

3427

3428

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3434

b0 = VLOAD(N0)(0, rhs);

3435

3436

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 8

#ifndef LHS_INTERLEAVE

3443

lhs += (M0 * K0 * (V0 - 1));

3444

#endif // LHS_INTERLEAVE

3445

3446

#ifndef RHS_INTERLEAVE

3447

rhs += (N0 * K0 * (H0 - 1));

3448

#endif // RHS_INTERLEAVE

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3449

}

3450

3451

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

3452

3453

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

3454

3455

#if defined(REINTERPRET_OUTPUT_AS_3D)

3456

3457

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

3458

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

3459

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3460

// multiply dst_stride_z by DEPTH_GEMM3D

3461

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

3462

3463

#else // defined(REINTERPRET_OUTPUT_AS_3D)

3464

3465

// Add offset for batched GEMM

3466

dst_addr += z * dst_stride_z;

3467

3468

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3469

3470

// Multiply by the weight of matrix-matrix product and store the result

3471

#if defined(ALPHA)

3472

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

3473

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

3478

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

3479

3480

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3481

3482

#ifndef UNIT_BETA

3483

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

3484

#endif // UNIT_BIAS

3485

3486

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3487

#if defined(MIXED_PRECISION)

3488

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3489

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

3490

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3491

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3492

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3493

3494

#else // defined(BROADCAST_BIAS)

3495

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

3496

3497

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3498

3499

#ifndef UNIT_BETA

3500

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

3501

#endif // UNIT_BIAS

3502

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3503

#if defined(MIXED_PRECISION)

3504

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3505

ADD_BLOCK(M0, c, bias_hp);

3506

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3507

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3508

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3509

3510

#endif // defined(BROADCAST_BIAS)

3511

#endif // defined(BETA)

3512

3513

#if defined(ACTIVATION_TYPE)

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

3514

#if defined(MIXED_PRECISION)

3515

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

3516

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3517

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

3518

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3519

#endif // defined(ACTIVATION_TYPE)

3520

3521

// Store output block

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3522

#if defined(MIXED_PRECISION)

3523

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3524

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3525

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3526

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3527

3528

#undef LHS_BLOCK_SIZE

3529

#undef LHS_OFFSET_X

3530

#undef LHS_STEP_X

3531

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3536

#if defined(OPENCL_IMAGE_SUPPORT)

3537

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

3538

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

3539

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

3540

*

3541

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

3542

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3543

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

3544

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

3545

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

3546

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

3547

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3548

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

3549

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

3550

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

3551

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

3552

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

3553

* @note Only the following configurations of M0, N0 and K0 are currently supported:

* - M0 = 2, 3, 4, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

3561

* The activation function is performed after the bias addition

3562

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

3563

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3564

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3565

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3566

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

3567

*

3568

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

3569

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

3570

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3571

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

3572

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

3573

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

3574

* @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr

3575

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

3576

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3577

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

3578

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3579

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

3580

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

3581

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

3582

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3583

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3584

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3585

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3586

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3587

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3588

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

3589

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

3590

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

3591

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3592

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3593

*/

3594

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),

3595

__read_only image2d_t rhs_img,

3596

#if defined(BETA)

3597

IMAGE_DECLARATION(bias),

3598

#endif // defined(BETA)

3599

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3600

uint k,

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

3606

uint dst_stride_z

3607

#if defined(REINTERPRET_OUTPUT_AS_3D)

3608

,

3609

uint dst_cross_plane_pad

3610

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

3615

3616

// Block size

3617

#define LHS_BLOCK_SIZE ((K0) * (M0))

3618

3619

#if defined(LHS_INTERLEAVE)

3620

#define LHS_OFFSET_X (M0)

3621

#define LHS_STEP_X ((M0) * (V0))

3622

#define LHS_STEP_LOOP (1)

3623

#else // defined(INTERLEAVE)

3624

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

3625

#define LHS_STEP_X (M0)

3626

#define LHS_STEP_LOOP (V0)

3627

#endif // defined(INTERLEAVE)

3628

3629

// Block size

3630

#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

3631

3632

// RHS offset and step X

3633

#if defined(RHS_INTERLEAVE)

3634

#define RHS_OFFSET_X (PIXEL_UNIT)

3635

#define RHS_STEP_X ((PIXEL_UNIT) * (H0))

3636

#else // defined(RHS_INTERLEAVE)

3637

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

3638

#define RHS_STEP_X (PIXEL_UNIT)

3639

#endif // defined(RHS_INTERLEAVE)

3640

3641

const uint x = get_global_id(0);

3642

const uint y = get_global_id(1);

3643

const uint z = get_global_id(2);

3644

3645

#if defined(DUMMY_WORK_ITEMS)

3646

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

3651

3652

// Compute LHS matrix address

3653

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

3654

3655

#if defined(MATRIX_B_DEPTH)

3656

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3657

const uint z_rhs = (z % MATRIX_B_DEPTH);

3658

#else // defined(MATRIX_B_DEPTH)

3659

const uint z_rhs = z;

3660

#endif // defined(MATRIX_B_DEPTH)

3661

3662

// Compute RHS matrix coordinates

3663

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

3664

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

3665

3666

// Initialize the accumulators

3667

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

3668

3669

REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

3670

3671

__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

3672

3673

for(int i = 0; i < K; i += K0)

3674

{

3675

VEC_DATA_TYPE(DATA_TYPE, M0)

3676

a0;

3677

VEC_DATA_TYPE(DATA_TYPE, N0)

3678

b0;

3679

3680

a0 = VLOAD(M0)(0, lhs);

3681

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

3682

3683

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#if K0 > 1

a0 = VLOAD(M0)(0, lhs);

3689

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

3690

3691

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 1

#if K0 > 2

a0 = VLOAD(M0)(0, lhs);

3698

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

3699

3700

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 2

#if K0 > 3

a0 = VLOAD(M0)(0, lhs);

3707

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

3708

3709

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 3

#if K0 > 4

a0 = VLOAD(M0)(0, lhs);

3716

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

3717

3718

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3723

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

3724

3725

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3730

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

3731

3732

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3737

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

3738

3739

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 4

#if K0 > 8

a0 = VLOAD(M0)(0, lhs);

3746

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

3747

3748

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3753

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

3754

3755

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3760

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

3761

3762

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3767

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

3768

3769

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3774

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

3775

3776

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3781

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

3782

3783

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3788

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

3789

3790

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3795

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

3796

3797

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 8

#ifndef LHS_INTERLEAVE

3803

lhs += (M0 * K0 * (V0 - 1));

3804

#endif // LHS_INTERLEAVE

3805

3806

x_rhs += K0 * RHS_STEP_X;

3807

#ifndef RHS_INTERLEAVE

3808

x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));

3809

#endif // RHS_INTERLEAVE

3810

}

3811

3812

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

3813

3814

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

3815

3816

#if defined(REINTERPRET_OUTPUT_AS_3D)

3817

3818

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

3819

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

3820

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3821

// multiply dst_stride_z by DEPTH_GEMM3D

3822

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

3823

3824

#else // defined(REINTERPRET_OUTPUT_AS_3D)

3825

3826

// Add offset for batched GEMM

3827

dst_addr += z * dst_stride_z;

3828

3829

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3830

3831

// Multiply by the weight of matrix-matrix product and store the result

3832

#if defined(ALPHA)

3833

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

3834

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

3839

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

3840

3841

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3842

3843

#ifndef UNIT_BETA

3844

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

3845

#endif // UNIT_BIAS

3846

3847

// c = c + bias[broadcasted]

3848

#if defined(MIXED_PRECISION)

3849

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3850

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

3851

#else // defined(MIXED_PRECISION)

3852

ADD_BLOCK_BROADCAST(M0, c, bias0);

3853

#endif // defined(MIXED_PRECISION)

3854

3855

#else // defined(BROADCAST_BIAS)

3856

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

3857

3858

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3859

3860

#ifndef UNIT_BETA

3861

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

3862

#endif // UNIT_BIAS

3863

3864

#if defined(MIXED_PRECISION)

3865

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3866

ADD_BLOCK(M0, c, bias_hp);

3867

#else // defined(MIXED_PRECISION)

3868

ADD_BLOCK(M0, c, bias);

3869

#endif // defined(MIXED_PRECISION)

3870

3871

#endif // defined(BROADCAST_BIAS)

3872

#endif // defined(BETA)

3873

3874

#if defined(ACTIVATION_TYPE)

3875

#if defined(MIXED_PRECISION)

3876

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

3877

#else // defined(MIXED_PRECISION)

3878

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

3879

#endif // defined(MIXED_PRECISION)

3880

#endif // defined(ACTIVATION_TYPE)

3881

3882

// Store output block

3883

#if defined(MIXED_PRECISION)

3884

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3885

#else // defined(MIXED_PRECISION)

3886

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3887

#endif // defined(MIXED_PRECISION)

3888

3889

#undef LHS_BLOCK_SIZE

3890

#undef LHS_OFFSET_X

3891

#undef LHS_STEP_X

3892

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3900

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3901

#endif // defined(LHS_TRANSPOSE)

3902

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

3903

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)

3904

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3905

#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

3906

3907

/* Fused multiply-add accumulate: c = fma(a, b, c).
 *
 * a, b : multiplicands
 * c    : accumulator; must be a modifiable lvalue because it is assigned in place
 *
 * The arguments are pasted unparenthesized, so pass simple expressions only.
 * The body is a ({ ... }) statement expression.
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/* RHS_VFMA_M0xN0(i, a, b, c): one rank-1 accumulation step over M0 rows.
 *
 * For every row r in [0, M0) it expands to
 *     c##r = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a##r.s##i), b, c##r)
 * i.e. component i of the LHS row vector a##r is converted to an N0-wide vector,
 * multiplied by the RHS vector b and accumulated into c##r.
 *
 * i : component selector suffix appended to ".s" (0-9, A-F)
 * a : basename of the M0 LHS row vectors (a0 .. a(M0-1))
 * b : RHS vector, used as-is for every row
 * c : basename of the M0 accumulator vectors (c0 .. c(M0-1))
 *
 * Only M0 = 1..8 is handled; any other value is a compile-time error.
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

3983

3984

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

3985

* The LHS matrix is NOT reshaped

3986

* The RHS matrix is NOT reshaped

3987

*

3988

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3989

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

3990

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

3991

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

3992

* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)

3993

* @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

3994

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

3995

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3996

* @note Only the following configurations of M0, N0 and K0 are currently supported:

3997

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

3998

* - N0 = 2, 3, 4, 8, 16

3999

* - K0 = 2, 3, 4, 8, 16

4000

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4001

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

4002

* The activation function is performed after the bias addition

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4003

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

4004

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

4005

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4006

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4007

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4008

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

4009

*

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4010

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

4011

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

4012

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

4013

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

4014

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

4015

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

4016

* @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr

4017

* @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)

4018

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

4019

* @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)

4020

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

4021

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4022

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

4023

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4024

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

4025

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4026

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

4027

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

4028

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

4029

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4030

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4031

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4032

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4033

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

4034

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

4035

* @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)

4036

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

4037

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

4038

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

4039

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4040

*/

4041

__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),

4042

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4043

#if defined(BETA)

4044

IMAGE_DECLARATION(bias),

4045

#endif // defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4046

IMAGE_DECLARATION(dst),

4047

uint lhs_stride_z,

4048

uint rhs_stride_z,

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4049

#if defined(BETA)

4050

uint bias_stride_z,

4051

#endif //defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4052

uint dst_stride_z

4053

#if defined(REINTERPRET_INPUT_AS_3D)

4054

,

4055

uint lhs_cross_plane_pad

4056

#endif // REINTERPRET_INPUT_AS_3D

4057

#if defined(REINTERPRET_OUTPUT_AS_3D)

4058

,

4059

uint dst_cross_plane_pad

4060

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

4065

4066

// RHS offset and step X

4067

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

4068

4069

uint x = get_global_id(0);

4070

uint y = get_global_id(1);

4071

uint z = get_global_id(2);

4072

4073

#if defined(DUMMY_WORK_ITEMS)

4074

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

4079

4080

// Compute LHS matrix address

4081

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

4082

4083

// Compute RHS matrix address

4084

uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

4085

4086

#if defined(MATRIX_B_DEPTH)

4087

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4088

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

4089

#else // defined(MATRIX_B_DEPTH)

4090

rhs_offset += z * rhs_stride_z;

4091

#endif // defined(MATRIX_B_DEPTH)

4092

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4093

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

4094

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4095

4096

#if defined(REINTERPRET_INPUT_AS_3D)

4097

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

4098

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

4099

4100

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4101

// multiply lhs_stride_z by DEPTH_GEMM3D

4102

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

4103

4104

#else // defined(REINTERPRET_INPUT_AS_3D)

4105

4106

// Add offset for batched GEMM

4107

lhs_offset += z * lhs_stride_z;

4108

4109

#endif // defined(REINTERPRET_INPUT_AS_3D)

4110

4111

// Initialize the accumulators

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4112

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4113

4114

int i = 0;

4115

for(; i <= (K - K0); i += K0)

4116

{

4117

// Supported cases (M0, K0):

4118

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

4119

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

4120

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

4121

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

4122

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

4123

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

4124

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

4125

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

4126

// Load values from LHS matrix

4127

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

4128

4129

// Load values from RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4130

LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4131

4132

RHS_VFMA_M0xN0(0, a, b0, c);

4133

RHS_VFMA_M0xN0(1, a, b1, c);

4134

#if K0 > 2

4135

RHS_VFMA_M0xN0(2, a, b2, c);

4136

#endif // K0 > 2

4137

#if K0 > 3

4138

RHS_VFMA_M0xN0(3, a, b3, c);

4139

#endif // K0 > 3

4140

#if K0 > 4

4141

RHS_VFMA_M0xN0(4, a, b4, c);

4142

RHS_VFMA_M0xN0(5, a, b5, c);

4143

RHS_VFMA_M0xN0(6, a, b6, c);

4144

RHS_VFMA_M0xN0(7, a, b7, c);

4145

#endif // K0 > 4

4146

#if K0 > 8

4147

RHS_VFMA_M0xN0(8, a, b8, c);

4148

RHS_VFMA_M0xN0(9, a, b9, c);

Gian Marco Iodice

7b9d7ca

2019-09-19 16:37:39 +0100

[diff] [blame]

4149

RHS_VFMA_M0xN0(A, a, bA, c);

4150

RHS_VFMA_M0xN0(B, a, bB, c);

4151

RHS_VFMA_M0xN0(C, a, bC, c);

4152

RHS_VFMA_M0xN0(D, a, bD, c);

4153

RHS_VFMA_M0xN0(E, a, bE, c);

4154

RHS_VFMA_M0xN0(F, a, bF, c);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4155

#endif // K0 > 8

4156

4157

lhs_offset += K0 * sizeof(DATA_TYPE);

4158

rhs_offset += K0 * rhs_stride_y;

4159

}

4160

4161

// Left-over accumulations

4162

for(; i < K; ++i)

4163

{

4164

// Load values from LHS matrix

4165

VEC_DATA_TYPE(DATA_TYPE, 2)

4166

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));

4167

#if M0 > 1

4168

VEC_DATA_TYPE(DATA_TYPE, 2)

4169

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));

4170

#endif // M0 > 1

4171

#if M0 > 2

4172

VEC_DATA_TYPE(DATA_TYPE, 2)

4173

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));

4174

#endif // M0 > 2

4175

#if M0 > 3

4176

VEC_DATA_TYPE(DATA_TYPE, 2)

4177

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));

4178

#endif // M0 > 3

4179

#if M0 > 4

4180

VEC_DATA_TYPE(DATA_TYPE, 2)

4181

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));

4182

#endif // M0 > 4

4183

#if M0 > 5

4184

VEC_DATA_TYPE(DATA_TYPE, 2)

4185

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));

4186

#endif // M0 > 5

4187

#if M0 > 6

4188

VEC_DATA_TYPE(DATA_TYPE, 2)

4189

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));

4190

#endif // M0 > 6

4191

#if M0 > 7

4192

VEC_DATA_TYPE(DATA_TYPE, 2)

4193

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));

4194

#endif // M0 > 7

4195

4196

VEC_DATA_TYPE(DATA_TYPE, N0)

4197

b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));

4198

RHS_VFMA_M0xN0(0, a, b, c);

4199

4200

lhs_offset += sizeof(DATA_TYPE);

4201

rhs_offset += rhs_stride_y;

4202

}

4203

4204

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

4205

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4206

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4207

4208

#if defined(REINTERPRET_OUTPUT_AS_3D)

4209

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

4210

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

4211

4212

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4213

// multiply dst_stride_z by DEPTH_GEMM3D

4214

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

4215

4216

#else // defined(REINTERPRET_OUTPUT_AS_3D)

4217

4218

// Add offset for batched GEMM

4219

dst_addr += z * dst_stride_z;

4220

4221

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4222

4223

// Multiply by the weight of matrix-matrix product and store the result

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4224

#if defined(ALPHA)

4225

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

4226

#endif // defined(ALPHA)

4227

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4228

// Add beta*bias

4229

#if defined(BETA)

4230

#if defined(BROADCAST_BIAS)

4231

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

4232

4233

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

4234

4235

#ifndef UNIT_BETA

4236

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

4237

#endif // UNIT_BIAS

4238

4239

// c = c + bias[broadcasted]

4240

ADD_BLOCK_BROADCAST(M0, c, bias0);

4241

4242

#else // defined(BROADCAST_BIAS)

4243

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

4244

2) * bias_stride_z;

4245

4246

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

4247

4248

#ifndef UNIT_BETA

4249

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

4254

4255

#endif // defined(BROADCAST_BIAS)

4256

#endif // defined(BETA)

4257

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

4258

#if defined(ACTIVATION_TYPE)

4259

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

4260

#endif // defined(ACTIVATION_TYPE)

4261

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4262

// Store output block

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

4263

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, M, N, y, x);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4264

4265

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

4270

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4271

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4272

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4273

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4274

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4275

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4276

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4277

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4278

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4279

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4280

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

4281

* The activation function is performed after the bias addition

4282

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4283

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4284

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4285

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4286

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4287

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4288

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

4289

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4290

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

4291

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4292

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

4293

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4294

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4295

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4296

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

4297

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4298

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

4299

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4300

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4301

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4302

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4303

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4304

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4305

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4306

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4307

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4308

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4309

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4310

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4311

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4312

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4313

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4314

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4315

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4316

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4317

*/

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4318

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),

4319

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4320

#if defined(BETA)

4321

IMAGE_DECLARATION(src2),

4322

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4323

IMAGE_DECLARATION(dst),

4324

uint src0_stride_z,

4325

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4326

#if defined(BETA)

4327

uint src2_stride_z,

4328

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4329

uint dst_stride_z

4330

#if defined(REINTERPRET_OUTPUT_AS_3D)

4331

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4332

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4333

#endif // REINTERPRET_OUTPUT_AS_3D

4334

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4335

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4336

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4337

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4338

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4339

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4340

// Offset

4341

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4342

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4343

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4344

// src_addr_a = address of matrix A

4345

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4346

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4347

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4348

4349

#if defined(MATRIX_B_DEPTH)

4350

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4351

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4352

#else // defined(MATRIX_B_DEPTH)

4353

src1_addr_in_bytes += z * src1_stride_z;

4354

#endif // defined(MATRIX_B_DEPTH)

4355

4356

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

4357

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4358

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4359

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4360

__global float *src_end_addr_b = src_addr_b + COLS_B;

4361

4362

src_addr_a += offset_row_a;

4363

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4364

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4365

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4370

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4371

for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4372

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4373

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4374

float4 a0 = vload4(0, src_addr_a);

4375

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4376

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4377

c0 += (float4)a0.s0 * b0;

4378

c1 += (float4)a0.s1 * b0;

4379

c2 += (float4)a0.s2 * b0;

4380

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4381

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4382

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4383

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

4384

b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4385

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4386

c0 += (float4)a0.s0 * b0;

4387

c1 += (float4)a0.s1 * b0;

4388

c2 += (float4)a0.s2 * b0;

4389

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4390

}

4391

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4392

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4393

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4394

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4395

float4 a0 = vload4(0, src_addr_a);

4396

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4397

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4398

c0 += (float4)a0.s0 * b0;

4399

c1 += (float4)a0.s1 * b0;

4400

c2 += (float4)a0.s2 * b0;

4401

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4402

}

4403

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4404

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4405

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4406

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4407

// Compute dst address

4408

__global uchar *dst_addr = offset(&dst, 0, 0);

4409

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4410

uint4 zout = 0;

4411

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4412

#if defined(REINTERPRET_OUTPUT_AS_3D)

4413

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4414

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4415

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4420

// |******************|

4421

// | cross_plane_pad |

4422

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4427

4428

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4429

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4430

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4431

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4432

// Add offset due to the cross plane paddings

4433

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4434

4435

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4436

// multiply dst_stride_z by DEPTH_GEMM3D

4437

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4438

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4439

// Add offset for batched GEMM

4440

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4441

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4442

4443

// Multiply by the weight of matrix-matrix product and store the result

4444

#if defined(ALPHA)

4445

SCALE_BLOCK(4, float, c, ALPHA);

4446

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4451

4452

#if defined(BROADCAST_BIAS)

4453

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4454

4455

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4456

4457

#ifndef UNIT_BETA

4458

SCALE_BLOCK(1, float, bias, BETA);

4459

#endif // UNIT_BETA

4460

4461

// c = c + bias[broadcasted]

4462

ADD_BLOCK_BROADCAST(4, c, bias0);

4463

4464

#else // defined(BROADCAST_BIAS)

4465

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

4466

2) * src2_stride_z;

4467

4468

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4469

4470

#ifndef UNIT_BETA

4471

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

4476

4477

#endif // defined(BROADCAST_BIAS)

4478

#endif // defined(BETA)

4479

4480

#if defined(ACTIVATION_TYPE)

4481

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

4482

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4483

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4484

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4485

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

4486

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

4487

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

4488

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4489

}

4490

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4491

/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4492

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4493

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4494

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4495

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4496

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4497

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4498

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4499

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4500

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

4501

* The activation function is performed after the bias addition

4502

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4503

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4504

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4505

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4506

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4507

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4508

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

4509

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4510

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

4511

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4512

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

4513

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4514

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4515

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4516

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

4517

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4518

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

4519

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4520

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4521

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4522

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4523

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4524

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4525

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4526

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4527

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4528

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4529

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4530

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4531

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4532

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4533

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4534

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4535

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4536

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4537

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4538

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

4539

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4540

#if defined(BETA)

4541

IMAGE_DECLARATION(src2),

4542

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4543

IMAGE_DECLARATION(dst),

4544

uint src0_stride_z,

4545

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4546

#if defined(BETA)

4547

uint src2_stride_z,

4548

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4549

uint dst_stride_z

4550

#if defined(REINTERPRET_OUTPUT_AS_3D)

4551

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4552

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4553

#endif // REINTERPRET_OUTPUT_AS_3D

4554

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4555

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4556

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4557

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4558

int z = get_global_id(2);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4559

4560

// Offset

4561

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4562

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

4563

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4564

// src_addr_a = address of matrix A

4565

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4566

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4567

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4568

4569

#if defined(MATRIX_B_DEPTH)

4570

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4571

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4572

#else // defined(MATRIX_B_DEPTH)

4573

src1_addr_in_bytes += z * src1_stride_z;

4574

#endif // defined(MATRIX_B_DEPTH)

4575

4576

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

4577

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4578

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4579

src_addr_a += offset_row_a;

4580

src_addr_b += offset_row_b;

4581

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4582

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4587

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4588

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

4589

4590

int i = 0;

4591

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4592

{

4593

// Load values from matrix A (interleaved) and matrix B (transposed)

4594

float4 a0 = vload4(0, src_addr_a);

4595

float4 b0 = vload4(0, src_addr_b);

4596

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4597

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4598

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4599

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4600

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4601

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4602

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4603

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4604

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4605

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4606

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4607

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4608

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4609

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4610

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4611

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4612

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4613

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4614

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4615

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4616

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4617

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4618

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4619

4620

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4621

a0 = vload4(0, src_addr_a);

4622

b0 = vload4(0, src_addr_b);

4623

4624

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4625

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4626

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4627

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4628

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4629

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4630

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4631

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4632

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4633

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4634

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4635

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4636

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4637

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4638

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4639

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4640

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4641

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4642

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4643

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4644

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4645

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4646

4647

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4648

a0 = vload4(0, src_addr_a);

4649

b0 = vload4(0, src_addr_b);

4650

4651

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4652

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

4653

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4654

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4655

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4656

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4657

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4658

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4659

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4660

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4661

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4662

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4663

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4664

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4665

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4666

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4667

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4668

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4669

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4670

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4671

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4672

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4673

4674

// Load values from matrix A (interleaved) and matrix B (transposed)

4675

a0 = vload4(0, src_addr_a);

4676

b0 = vload4(0, src_addr_b);

4677

4678

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4679

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4680

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4681

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4682

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4683

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4684

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4685

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4686

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4687

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4688

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4689

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4690

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4691

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4692

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4693

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4694

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4695

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4696

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4697

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4698

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4699

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4700

}

4701

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4702

for(; i < (int)(COLS_MTX_B); ++i)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4703

{

4704

// Load values from matrix A (interleaved) and matrix B (transposed)

4705

float4 a0 = vload4(0, src_addr_a);

4706

float4 b0 = vload4(0, src_addr_b);

4707

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4708

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4709

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

4710

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4711

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4712

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4713

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4714

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4715

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4716

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4717

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4718

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4719

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4720

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4721

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4722

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4723

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4724

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4725

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4726

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4727

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4728

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4729

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4730

}

4731

4732

// Compute destination address

4733

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4734

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4735

// Compute dst address

4736

__global uchar *dst_addr = offset(&dst, 0, 0);

4737

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4738

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4739

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4740

#if defined(REINTERPRET_OUTPUT_AS_3D)

4741

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4742

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4743

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4748

// |******************|

4749

// | cross_plane_pad |

4750

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4755

4756

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4757

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4758

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4759

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4760

// Add offset due to the cross plane paddings

4761

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4762

4763

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4764

// multiply dst_stride_z by DEPTH_GEMM3D

4765

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4766

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4767

// Add offset for batched GEMM

4768

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4769

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4770

4771

// Multiply by the weight of matrix-matrix product and store the result

4772

#if defined(ALPHA)

4773

SCALE_BLOCK(4, float, c, ALPHA);

4774

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4779

4780

#if defined(BROADCAST_BIAS)

4781

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4782

4783

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4784

4785

#ifndef UNIT_BETA

4786

SCALE_BLOCK(1, float, bias, BETA);

4787

#endif // UNIT_BIAS

4788

4789

// c = c + bias[broadcasted]

4790

ADD_BLOCK_BROADCAST(4, c, bias0);

4791

4792

#else // defined(BROADCAST_BIAS)

4793

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

4794

2) * src2_stride_z;

4795

4796

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4797

4798

#ifndef UNIT_BETA

4799

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias);

4804

4805

#endif // defined(BROADCAST_BIAS)

4806

#endif // defined(BETA)

4807

4808

#if defined(ACTIVATION_TYPE)

4809

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

4810

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4811

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4812

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4813

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

4814

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

4815

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

4816

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4817

}

4818

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

4819

// Undefine local defines

4820

#undef COLS_MTX_B

4821

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

4822

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4823

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4824

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4825

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4826

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4827

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4828

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4829

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4830

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4831

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

4832

* The activation function is performed after the bias addition

4833

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4834

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4835

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4836

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4837

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4838

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4839

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

4840

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4841

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

4842

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4843

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

4844

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4845

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4846

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4847

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

4848

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4849

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

4850

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4851

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

4852

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4853

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4854

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4855

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4856

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4857

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4858

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4859

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4860

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4861

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4862

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4863

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4864

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4865

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4866

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4867

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4868

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4869

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

4870

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4871

#if defined(BETA)

4872

IMAGE_DECLARATION(src2),

4873

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4874

IMAGE_DECLARATION(dst),

4875

uint src0_stride_z,

4876

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4877

#if defined(BETA)

4878

uint src2_stride_z,

4879

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4880

uint dst_stride_z

4881

#if defined(REINTERPRET_OUTPUT_AS_3D)

4882

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4883

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4884

#endif // REINTERPRET_OUTPUT_AS_3D

4885

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4886

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4887

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4888

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4889

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4890

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4891

// Offset

4892

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4893

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4894

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4895

// src_addr_a = address of matrix A

4896

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4897

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4898

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4899

4900

#if defined(MATRIX_B_DEPTH)

4901

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4902

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4903

#else // defined(MATRIX_B_DEPTH)

4904

src1_addr_in_bytes += z * src1_stride_z;

4905

#endif // defined(MATRIX_B_DEPTH)

4906

4907

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

4908

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4909

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4910

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4911

__global half *src_end_addr_b = src_addr_b + COLS_B;

4912

4913

src_addr_a += offset_row_a;

4914

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4915

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4916

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4921

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4922

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4923

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4924

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4925

half4 a0 = vload4(0, src_addr_a);

4926

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4927

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4928

c0 += (half8)a0.s0 * b0;

4929

c1 += (half8)a0.s1 * b0;

4930

c2 += (half8)a0.s2 * b0;

4931

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4932

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4933

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4934

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

4935

b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4936

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4937

c0 += (half8)a0.s0 * b0;

4938

c1 += (half8)a0.s1 * b0;

4939

c2 += (half8)a0.s2 * b0;

4940

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4941

}

4942

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4943

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4944

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4945

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4946

half4 a0 = vload4(0, src_addr_a);

4947

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4948

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4949

c0 += (half8)a0.s0 * b0;

4950

c1 += (half8)a0.s1 * b0;

4951

c2 += (half8)a0.s2 * b0;

4952

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4953

}

4954

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4955

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4956

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4957

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4958

// Compute dst address

4959

__global uchar *dst_addr = offset(&dst, 0, 0);

4960

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4961

uint4 zout = 0;

4962

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4963

#if defined(REINTERPRET_OUTPUT_AS_3D)

4964

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4965

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4966

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4971

// |******************|

4972

// | cross_plane_pad |

4973

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4978

4979

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4980

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4981

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4982

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4983

// Add offset due to the cross plane paddings

4984

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4985

4986

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4987

// multiply dst_stride_z by DEPTH_GEMM3D

4988

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4989

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4990

// Add offset for batched GEMM

4991

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4992

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4993

4994

// Multiply by the weight of matrix-matrix product and store the result

4995

#if defined(ALPHA)

4996

SCALE_BLOCK(4, half, c, ALPHA);

4997

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5002

5003

#if defined(BROADCAST_BIAS)

5004

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5005

5006

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5007

5008

#ifndef UNIT_BETA

5009

SCALE_BLOCK(1, half, bias, BETA);

5010

#endif // UNIT_BIAS

5011

5012

// c = c + bias[broadcasted]

5013

ADD_BLOCK_BROADCAST(4, c, bias0);

5014

5015

#else // defined(BROADCAST_BIAS)

5016

5017

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5018

2) * src2_stride_z;

5019

5020

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5021

5022

#ifndef UNIT_BETA

5023

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias);

5028

5029

#endif // defined(BROADCAST_BIAS)

5030

#endif // defined(BETA)

5031

5032

#if defined(ACTIVATION_TYPE)

5033

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

5034

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5035

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5036

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5037

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5038

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5039

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5040

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5041

}

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5042

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5043

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable.

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5044

*

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5045

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5046

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

5047

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

5048

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5049

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5050

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5051

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

5052

* The activation function is performed after the bias addition

5053

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5054

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5055

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5056

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5057

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5058

*

5059

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5060

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5061

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5062

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5063

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5064

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5065

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5066

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5067

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5068

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5069

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5070

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5071

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5072

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5073

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5074

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5075

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5076

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5077

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

5078

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5079

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5080

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5081

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5082

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

5083

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5084

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5085

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5086

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

5087

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

5088

*/

5089

__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),

5090

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5091

#if defined(BETA)

5092

IMAGE_DECLARATION(src2),

5093

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5094

IMAGE_DECLARATION(dst),

5095

uint src0_stride_z,

5096

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5097

#if defined(BETA)

5098

uint src2_stride_z,

5099

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5100

uint dst_stride_z

5101

#if defined(REINTERPRET_OUTPUT_AS_3D)

5102

,

5103

uint cross_plane_pad

5104

#endif // REINTERPRET_OUTPUT_AS_3D

5105

)

5106

{

5107

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

5108

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

5109

int z = get_global_id(2);

5110

5111

// Offset

5112

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

5113

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

5114

5115

// src_addr_a = address of matrix A

5116

// src_addr_b = address of matrix B

5117

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

5118

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

5119

5120

#if defined(MATRIX_B_DEPTH)

5121

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5122

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

5123

#else // defined(MATRIX_B_DEPTH)

5124

src1_addr_in_bytes += z * src1_stride_z;

5125

#endif // defined(MATRIX_B_DEPTH)

5126

5127

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

5128

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

5129

5130

// Compute end row address for matrix B

5131

__global half *src_end_addr_b = src_addr_b + COLS_B;

5132

5133

src_addr_a += offset_row_a;

5134

src_addr_b += offset_row_b;

5135

5136

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float8 c0 = 0.0f;

float8 c1 = 0.0f;

float8 c2 = 0.0f;

float8 c3 = 0.0f;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5141

5142

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

5143

{

5144

// Load values from matrix A (interleaved) and matrix B (transposed)

5145

float4 a0 = convert_float4(vload4(0, src_addr_a));

5146

float8 b0 = convert_float8(vload8(0, src_addr_b));

5147

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5148

c0 += (float8)a0.s0 * b0;

5149

c1 += (float8)a0.s1 * b0;

5150

c2 += (float8)a0.s2 * b0;

5151

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5152

5153

// Load values from matrix A (interleaved) and matrix B (transposed)

5154

a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));

5155

b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));

5156

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5157

c0 += (float8)a0.s0 * b0;

5158

c1 += (float8)a0.s1 * b0;

5159

c2 += (float8)a0.s2 * b0;

5160

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5161

}

5162

5163

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

5164

{

5165

// Load values from matrix A (interleaved) and matrix B (transposed)

5166

float4 a0 = convert_float4(vload4(0, src_addr_a));

5167

float8 b0 = convert_float8(vload8(0, src_addr_b));

5168

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5169

c0 += (float8)a0.s0 * b0;

5170

c1 += (float8)a0.s1 * b0;

5171

c2 += (float8)a0.s2 * b0;

5172

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5173

}

5174

5175

// Compute destination address

5176

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5177

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5178

// Compute dst address

5179

__global uchar *dst_addr = offset(&dst, 0, 0);

5180

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5181

uint4 zout = 0;

5182

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5183

#if defined(REINTERPRET_OUTPUT_AS_3D)

5184

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

5185

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5191

// |******************|

5192

// | cross_plane_pad |

5193

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5198

5199

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5200

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

5201

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5202

5203

// Add offset due to the cross plane paddings

5204

zout *= (cross_plane_pad * dst_stride_y);

5205

5206

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5207

// multiply dst_stride_z by DEPTH_GEMM3D

5208

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5209

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5210

// Add offset for batched GEMM

5211

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5212

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5213

5214

// Multiply by the weight of matrix-matrix product and store the result

5215

#if defined(ALPHA)

5216

SCALE_BLOCK(4, float, c, ALPHA);

5217

#endif // defined(ALPHA)

5218

5219

#if defined(BETA)

5220

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5221

5222

#if defined(BROADCAST_BIAS)

5223

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5224

5225

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5226

5227

float8 bias_f0 = convert_float8(bias0);

5228

5229

#ifndef UNIT_BETA

5230

SCALE_BLOCK(1, float, bias_f, BETA);

5231

#endif // UNIT_BIAS

5232

5233

// c = c + bias[broadcasted]

5234

ADD_BLOCK_BROADCAST(4, c, bias_f0);

5235

5236

#else // defined(BROADCAST_BIAS)

5237

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5238

2) * src2_stride_z;

5239

5240

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5241

5242

float8 bias_f0 = convert_float8(bias0);

5243

float8 bias_f1 = convert_float8(bias1);

5244

float8 bias_f2 = convert_float8(bias2);

5245

float8 bias_f3 = convert_float8(bias3);

5246

5247

#ifndef UNIT_BETA

5248

SCALE_BLOCK(4, float, bias_f, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias_f);

5253

5254

#endif // defined(BROADCAST_BIAS)

5255

#endif // defined(BETA)

5256

5257

half8 c_h0 = convert_half8(c0);

5258

half8 c_h1 = convert_half8(c1);

5259

half8 c_h2 = convert_half8(c2);

5260

half8 c_h3 = convert_half8(c3);

5261

5262

#if defined(ACTIVATION_TYPE)

5263

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);

5264

#endif // defined(ACTIVATION_TYPE)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5265

5266

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5267

vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5268

vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5269

vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5270

vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5271

}

5272

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5273

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5274

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5275

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5276

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

5277

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

5278

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5279

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5280

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5281

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

5282

* The activation function is performed after the bias addition

5283

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5284

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5285

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5286

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5287

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5288

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5289

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5290

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5291

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5292

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5293

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5294

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5295

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5296

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5297

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5298

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5299

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5300

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5301

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

5302

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5303

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5304

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5305

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5306

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5307

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

5308

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5309

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5310

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5311

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5312

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5313

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5314

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

5315

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5316

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5317

*/

5318

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),

5319

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5320

#if defined(BETA)

5321

IMAGE_DECLARATION(src2),

5322

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5323

IMAGE_DECLARATION(dst),

5324

uint src0_stride_z,

5325

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5326

#if defined(BETA)

5327

uint src2_stride_z,

5328

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5329

uint dst_stride_z

5330

#if defined(REINTERPRET_OUTPUT_AS_3D)

5331

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5332

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5333

#endif // REINTERPRET_OUTPUT_AS_3D

5334

)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5335

{

5336

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

5337

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

5338

int z = get_global_id(2);

5339

5340

// Offset

5341

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

5342

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

5343

5344

// src_addr_a = address of matrix A

5345

// src_addr_b = address of matrix B

5346

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

5347

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

5348

5349

#if defined(MATRIX_B_DEPTH)

5350

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5351

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

5352

#else // defined(MATRIX_B_DEPTH)

5353

src1_addr_in_bytes += z * src1_stride_z;

5354

#endif // defined(MATRIX_B_DEPTH)

5355

5356

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

5357

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

5358

5359

// Compute end row address for matrix B

5360

__global half *src_end_addr_b = src_addr_b + COLS_B;

5361

5362

src_addr_a += offset_row_a;

5363

src_addr_b += offset_row_b;

5364

5365

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5370

5371

#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

5372

5373

int i = 0;

5374

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

5375

{

5376

#if MULT_INTERLEAVE4X4_HEIGHT == 1

5377

// Load values from matrix A (interleaved) and matrix B (transposed)

5378

half8 a0 = vload8(0, src_addr_a);

5379

half8 b0 = vload8(0, src_addr_b);

5380

5381

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

5382

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5383

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5384

c0 = fma((half8)a0.s0, b0, c0);

5385

c1 = fma((half8)a0.s1, b0, c1);

5386

c2 = fma((half8)a0.s2, b0, c2);

5387

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5388

5389

// Load values from matrix B (transposed)

5390

b0 = vload8(0, src_addr_b);

5391

5392

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5393

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5394

c0 = fma((half8)a0.s4, b0, c0);

5395

c1 = fma((half8)a0.s5, b0, c1);

5396

c2 = fma((half8)a0.s6, b0, c2);

5397

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5398

5399

// Load values from matrix A (interleaved) and matrix B (transposed)

5400

a0 = vload8(0, src_addr_a);

5401

b0 = vload8(0, src_addr_b);

5402

5403

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

5404

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5405

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5406

c0 = fma((half8)a0.s0, b0, c0);

5407

c1 = fma((half8)a0.s1, b0, c1);

5408

c2 = fma((half8)a0.s2, b0, c2);

5409

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5410

5411

// Load values from matrix B (transposed)

5412

b0 = vload8(0, src_addr_b);

5413

5414

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5415

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5416

c0 = fma((half8)a0.s4, b0, c0);

5417

c1 = fma((half8)a0.s5, b0, c1);

5418

c2 = fma((half8)a0.s6, b0, c2);

5419

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5420

#else // MULT_INTERLEAVE4X4_HEIGHT == 1

5421

// Load values from matrix A (interleaved) and matrix B (transposed)

5422

half4 a0 = vload4(0, src_addr_a);

5423

half8 b0 = vload8(0, src_addr_b);

5424

5425

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5426

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5427

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5428

c0 = fma((half8)a0.s0, b0, c0);

5429

c1 = fma((half8)a0.s1, b0, c1);

5430

c2 = fma((half8)a0.s2, b0, c2);

5431

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5432

5433

// Load values from matrix A (interleaved) and matrix B (transposed)

5434

a0 = vload4(0, src_addr_a);

5435

b0 = vload8(0, src_addr_b);

5436

5437

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5438

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5439

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5440

c0 = fma((half8)a0.s0, b0, c0);

5441

c1 = fma((half8)a0.s1, b0, c1);

5442

c2 = fma((half8)a0.s2, b0, c2);

5443

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5444

5445

// Load values from matrix A (interleaved) and matrix B (transposed)

5446

a0 = vload4(0, src_addr_a);

5447

b0 = vload8(0, src_addr_b);

5448

5449

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5450

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5451

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5452

c0 = fma((half8)a0.s0, b0, c0);

5453

c1 = fma((half8)a0.s1, b0, c1);

5454

c2 = fma((half8)a0.s2, b0, c2);

5455

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5456

5457

// Load values from matrix A (interleaved) and matrix B (transposed)

5458

a0 = vload4(0, src_addr_a);

5459

b0 = vload8(0, src_addr_b);

5460

5461

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5462

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5463

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5464

c0 = fma((half8)a0.s0, b0, c0);

5465

c1 = fma((half8)a0.s1, b0, c1);

5466

c2 = fma((half8)a0.s2, b0, c2);

5467

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5468

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

5469

}

5470

5471

for(; i < (int)(COLS_MTX_B); ++i)

5472

{

5473

// Load values from matrix A (interleaved) and matrix B (transposed)

5474

half4 a0 = vload4(0, src_addr_a);

5475

half8 b0 = vload8(0, src_addr_b);

5476

5477

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5478

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5479

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5480

c0 = fma((half8)a0.s0, b0, c0);

5481

c1 = fma((half8)a0.s1, b0, c1);

5482

c2 = fma((half8)a0.s2, b0, c2);

5483

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5484

}

5485

5486

// Compute destination address

5487

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5488

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5489

// Compute dst address

5490

__global uchar *dst_addr = offset(&dst, 0, 0);

5491

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5492

uint4 zout = 0;

5493

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5494

#if defined(REINTERPRET_OUTPUT_AS_3D)

5495

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5496

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5497

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5502

// |******************|

5503

// | cross_plane_pad |

5504

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5509

5510

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5511

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

5512

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5513

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5514

// Add offset due to the cross plane paddings

5515

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5516

5517

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5518

// multiply dst_stride_z by DEPTH_GEMM3D

5519

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5520

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5521

// Add offset for batched GEMM

5522

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5523

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5524

5525

// Multiply by the weight of matrix-matrix product and store the result

5526

#if defined(ALPHA)

5527

SCALE_BLOCK(4, half, c, ALPHA);

5528

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5533

5534

#if defined(BROADCAST_BIAS)

5535

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5536

5537

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5538

5539

#ifndef UNIT_BETA

5540

SCALE_BLOCK(1, half, bias, BETA);

5541

#endif // UNIT_BIAS

5542

5543

// c = c + bias[broadcasted]

5544

ADD_BLOCK_BROADCAST(4, c, bias0);

5545

5546

#else // defined(BROADCAST_BIAS)

5547

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5548

2) * src2_stride_z;

5549

5550

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5551

5552

#ifndef UNIT_BETA

5553

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias);

5558

5559

#endif // defined(BROADCAST_BIAS)

5560

#endif // defined(BETA)

5561

5562

#if defined(ACTIVATION_TYPE)

5563

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

5564

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5565

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5566

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5567

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5568

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5569

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5570

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5571

}

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

5572

5573

// Undefine local defines

5574

#undef COLS_MTX_B

5575

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

5576

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5577

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

5578

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5579

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5580

#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

5581

#if defined(DATA_TYPE)

5582

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5583

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.

5584

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5585

* @note This OpenCL kernel works with floating point data types (F16/F32)

5586

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

5587

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5588

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5589

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5590

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5591

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5592

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

5593

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5594

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5595

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5596

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5597

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5598

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5599

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5600

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5601

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5602

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5603

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5604

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5605

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5606

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5607

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5608

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5609

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5610

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5611

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5612

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5613

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5614

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5615

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5616

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5617

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5618

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5619

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5620

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5621

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

5622

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5623

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

5624

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5625

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5626

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5627

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5628

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5629

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5630

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5631

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5632

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

5633

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5634

#if defined(BETA)

5635

IMAGE_DECLARATION(src2),

5636

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5637

IMAGE_DECLARATION(dst),

5638

uint src0_stride_z,

5639

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5640

#if defined(BETA)

5641

uint src2_stride_z,

5642

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5643

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5644

#if defined(REINTERPRET_INPUT_AS_3D)

5645

,

5646

uint src_cross_plane_pad

5647

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5648

#if defined(REINTERPRET_OUTPUT_AS_3D)

5649

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5650

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5651

#endif // REINTERPRET_OUTPUT_AS_3D

5652

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5653

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5654

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5655

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5656

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5657

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5658

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5659

// Update address for the matrix A

5660

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5661

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5662

// Update address for the matrix B

5663

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5664

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5665

#if defined(REINTERPRET_INPUT_AS_3D)

5666

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5667

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5673

// |******************|

5674

// | cross_plane_pad |

5675

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5680

5681

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5682

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5683

zin = min(DEPTH_GEMM3D - 1, zin);

5684

5685

// Add offset due to the cross plane paddings

5686

zin *= (src_cross_plane_pad * src0_stride_y);

5687

5688

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5689

// multiply src0_stride_z by DEPTH_GEMM3D

5690

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5691

5692

#else // defined(REINTERPRET_INPUT_AS_3D)

5693

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5694

// Add offset for batched GEMM

5695

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5696

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5697

#endif // defined(REINTERPRET_INPUT_AS_3D)

5698

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5699

#if defined(MATRIX_B_DEPTH)

5700

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5701

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

5702

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5703

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5704

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5705

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5706

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

5707

5708

VECTOR_TYPE acc0 = 0.0f;

5709

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5710

VECTOR_TYPE acc1 = 0.0f;

5711

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5712

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5713

VECTOR_TYPE acc2 = 0.0f;

5714

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5715

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5716

VECTOR_TYPE acc3 = 0.0f;

5717

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5718

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

5719

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5720

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5721

#if defined(REINTERPRET_INPUT_AS_3D)

5722

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5723

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

5724

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5725

// Load values from matrix A

5726

VEC_DATA_TYPE(DATA_TYPE, 2)

5727

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5728

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5729

VEC_DATA_TYPE(DATA_TYPE, 2)

5730

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5731

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5732

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5733

VEC_DATA_TYPE(DATA_TYPE, 2)

5734

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5735

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5736

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5737

VEC_DATA_TYPE(DATA_TYPE, 2)

5738

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5739

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5740

#endif // defined(REINTERPRET_INPUT_AS_3D)

5741

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5742

// Load values from matrix B

5743

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

5744

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5745

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5746

// Accumulate

5747

acc0 += b0 * (VECTOR_TYPE)a0.s0;

5748

acc0 += b1 * (VECTOR_TYPE)a0.s1;

5749

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5750

acc1 += b0 * (VECTOR_TYPE)a1.s0;

5751

acc1 += b1 * (VECTOR_TYPE)a1.s1;

5752

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5753

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5754

acc2 += b0 * (VECTOR_TYPE)a2.s0;

5755

acc2 += b1 * (VECTOR_TYPE)a2.s1;

5756

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5757

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5758

acc3 += b0 * (VECTOR_TYPE)a3.s0;

5759

acc3 += b1 * (VECTOR_TYPE)a3.s1;

5760

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5761

}

5762

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5763

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5764

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5765

#if defined(REINTERPRET_INPUT_AS_3D)

5766

// Load values from matrix A

5767

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

5768

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5769

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

5770

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5771

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5772

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

5773

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5774

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5775

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

5776

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5777

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5778

// Load values from matrix A

5779

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5780

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5781

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5782

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5783

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5784

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5785

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5786

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5787

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5788

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5789

#endif // defined(REINTERPRET_INPUT_AS_3D)

5790

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5791

// Load values from matrix B

5792

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5793

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5794

// Accumulate

5795

acc0 += b0 * (VECTOR_TYPE)a0;

5796

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5797

acc1 += b0 * (VECTOR_TYPE)a1;

5798

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5799

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5800

acc2 += b0 * (VECTOR_TYPE)a2;

5801

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5802

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5803

acc3 += b0 * (VECTOR_TYPE)a3;

5804

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5805

}

5806

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5807

int z = get_global_id(2);

5808

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5809

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5810

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5811

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5812

// Compute dst address

5813

__global uchar *dst_addr = offset(&dst, 0, 0);

5814

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5815

uint4 zout = 0;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5816

5817

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5818

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5819

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5820

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5821

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5826

// |******************|

5827

// | cross_plane_pad |

5828

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5833

5834

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5835

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5836

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5837

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5838

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5839

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5840

5841

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5842

// multiply dst_stride_z by DEPTH_GEMM3D

5843

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5844

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5845

// Add offset for batched GEMM

5846

dst_addr += z * dst_stride_z;

5847

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5848

5849

// Multiply by the weight of matrix-matrix product and store the result

5850

#if defined(ALPHA)

5851

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);

5852

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

5857

5858

#if defined(BROADCAST_BIAS)

5859

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));

5860

5861

LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

5862

5863

#ifndef UNIT_BETA

5864

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

5865

#endif // UNIT_BETA

5866

5867

// c = c + bias[broadcasted]

5868

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

5869

5870

#else // defined(BROADCAST_BIAS)

5871

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *

5872

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

5873

5874

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

5875

5876

#ifndef UNIT_BETA

5877

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

5882

5883

#endif // defined(BROADCAST_BIAS)

5884

#endif // defined(BETA)

5885

5886

#if defined(ACTIVATION_TYPE)

5887

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);

5888

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5889

5890

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5891

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5892

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5893

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5894

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

5895

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5896

*

5897

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

5898

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

5899

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

5900

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

5901

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5902

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5903

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5904

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5905

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5906

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5907

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5908

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5909

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5910

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5911

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5912

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped

5913

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5914

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5915

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5916

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5917

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5918

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5919

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5920

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5921

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5922

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5923

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5924

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5925

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5926

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5927

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5928

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5929

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5930

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5931

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5932

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

5933

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5934

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

5935

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5936

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

5937

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5938

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5939

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5940

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5941

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5942

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5943

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5944

*/

5945

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

5946

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5947

#if defined(BETA)

5948

IMAGE_DECLARATION(src2),

5949

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5950

IMAGE_DECLARATION(dst),

5951

uint src0_stride_z,

5952

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5953

#if defined(BETA)

5954

uint src2_stride_z,

5955

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5956

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5957

#if defined(REINTERPRET_INPUT_AS_3D)

5958

,

5959

uint src_cross_plane_pad

5960

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5961

#if defined(REINTERPRET_OUTPUT_AS_3D)

5962

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5963

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5964

#endif // REINTERPRET_OUTPUT_AS_3D

5965

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5966

{

5967

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

5968

5969

// Compute starting address for matrix A and matrix B

5970

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

5971

5972

// Update address for matrix A

5973

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

5974

5975

// Update address for matrix B

5976

src_addr.s1 += idx * sizeof(float);

5977

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5978

#if defined(REINTERPRET_INPUT_AS_3D)

5979

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5980

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5986

// |******************|

5987

// | cross_plane_pad |

5988

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5993

5994

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5995

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5996

zin = min(DEPTH_GEMM3D - 1, zin);

5997

5998

// Add offset due to the cross plane paddings

5999

zin *= (src_cross_plane_pad * src0_stride_y);

6000

6001

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6002

// multiply src0_stride_z by DEPTH_GEMM3D

6003

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6004

6005

#else // defined(REINTERPRET_INPUT_AS_3D)

6006

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6007

// Add offset for batched GEMM

6008

src_addr.s0 += get_global_id(2) * src0_stride_z;

6009

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6010

#endif // defined(REINTERPRET_INPUT_AS_3D)

6011

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6012

#if defined(MATRIX_B_DEPTH)

6013

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6014

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6015

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6016

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6017

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6018

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6019

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6020

float4 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6021

6022

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6023

float4 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6024

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6025

6026

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6027

float4 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6028

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6029

6030

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6031

float4 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6032

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6033

6034

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6035

int i = 0;

6036

for(; i <= ((int)COLS_A - 4); i += 4)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6037

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6038

#if defined(REINTERPRET_INPUT_AS_3D)

6039

// Load values from matrix A and matrix B

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6040

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

6041

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6042

// Load values from matrix A and matrix B

6043

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6044

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6045

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6046

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6047

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6048

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6049

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6050

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6051

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6052

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6053

#endif // defined(REINTERPRET_INPUT_AS_3D)

6054

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6055

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6056

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6057

6058

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6059

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

6060

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

6061

acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);

6062

acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6063

6064

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6065

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6066

acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);

6067

acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);

6068

acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);

6069

acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6070

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6071

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6072

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6073

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6074

acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);

6075

acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);

6076

acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);

6077

acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6078

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6079

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6080

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6081

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6082

acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);

6083

acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);

6084

acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);

6085

acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6086

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6087

6088

// Load values from matrix A and matrix B

6089

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6090

src_addr.s1 += src1_stride_y;

6091

6092

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6093

acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);

6094

acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);

6095

acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);

6096

acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6097

6098

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6099

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6100

acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);

6101

acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);

6102

acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);

6103

acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6104

6105

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6106

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6107

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6108

acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);

6109

acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);

6110

acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);

6111

acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6112

6113

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6114

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6115

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6116

acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);

6117

acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);

6118

acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);

6119

acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6120

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6121

6122

// Load values from matrix A and matrix B

6123

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6124

src_addr.s1 += src1_stride_y;

6125

6126

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6127

acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);

6128

acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);

6129

acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);

6130

acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6131

6132

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6133

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6134

acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);

6135

acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);

6136

acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);

6137

acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6138

6139

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6140

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6141

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6142

acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);

6143

acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);

6144

acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);

6145

acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6146

6147

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6148

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6149

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6150

acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);

6151

acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);

6152

acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);

6153

acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6154

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6155

6156

// Load values from matrix A and matrix B

6157

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6158

src_addr.s1 += src1_stride_y;

6159

6160

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6161

acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);

6162

acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);

6163

acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);

6164

acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6165

6166

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6167

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6168

acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);

6169

acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);

6170

acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);

6171

acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6172

6173

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6174

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6175

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6176

acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);

6177

acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);

6178

acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);

6179

acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6180

6181

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6182

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6183

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6184

acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);

6185

acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);

6186

acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);

6187

acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6188

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6189

6190

src_addr.s0 += 4 * sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6191

}

6192

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6193

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6194

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6195

#if defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6196

// Load values from matrix A

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6197

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6198

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6199

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6200

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6201

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6202

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6203

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6204

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6205

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6206

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6207

#else // defined(REINTERPRET_INPUT_AS_3D)

6208

// Load values from matrix A

6209

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6210

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6211

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6212

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6213

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6214

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6215

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6216

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6217

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6218

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6219

#endif // defined(REINTERPRET_INPUT_AS_3D)

6220

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6221

// Load values from matrix B

6222

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6223

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6224

6225

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6226

acc0.s0 = fma(a0, b0.s0, acc0.s0);

6227

acc0.s1 = fma(a0, b0.s1, acc0.s1);

6228

acc0.s2 = fma(a0, b0.s2, acc0.s2);

6229

acc0.s3 = fma(a0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6230

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6231

acc1.s0 = fma(a1, b0.s0, acc1.s0);

6232

acc1.s1 = fma(a1, b0.s1, acc1.s1);

6233

acc1.s2 = fma(a1, b0.s2, acc1.s2);

6234

acc1.s3 = fma(a1, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6235

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6236

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6237

acc2.s0 = fma(a2, b0.s0, acc2.s0);

6238

acc2.s1 = fma(a2, b0.s1, acc2.s1);

6239

acc2.s2 = fma(a2, b0.s2, acc2.s2);

6240

acc2.s3 = fma(a2, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6241

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6242

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6243

acc3.s0 = fma(a3, b0.s0, acc3.s0);

6244

acc3.s1 = fma(a3, b0.s1, acc3.s1);

6245

acc3.s2 = fma(a3, b0.s2, acc3.s2);

6246

acc3.s3 = fma(a3, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6247

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6248

6249

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6250

}

6251

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6252

int z = get_global_id(2);

6253

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6254

// Compute destination address

6255

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6256

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6257

// Compute dst address

6258

__global uchar *dst_addr = offset(&dst, 0, 0);

6259

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6260

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

6261

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6262

#if defined(REINTERPRET_OUTPUT_AS_3D)

6263

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6264

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6265

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

6270

// |******************|

6271

// | cross_plane_pad |

6272

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6277

6278

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6279

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6280

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6281

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6282

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6283

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6284

6285

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6286

// multiply dst_stride_z by DEPTH_GEMM3D

6287

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6288

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6289

// Add offset for batched GEMM

6290

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6291

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6292

6293

// Multiply by the weight of matrix-matrix product and store the result

6294

#if defined(ALPHA)

6295

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

6296

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6301

6302

#if defined(BROADCAST_BIAS)

6303

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

6304

6305

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

6306

6307

#ifndef UNIT_BETA

6308

SCALE_BLOCK(1, float, bias, BETA);

6309

#endif // UNIT_BIAS

6310

6311

// acc = acc + bias[broadcasted]

6312

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6313

6314

#else // defined(BROADCAST_BIAS)

6315

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *

6316

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6317

6318

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

6319

6320

#ifndef UNIT_BETA

6321

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BIAS

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6326

6327

#endif // defined(BROADCAST_BIAS)

6328

#endif // defined(BETA)

6329

6330

#if defined(ACTIVATION_TYPE)

6331

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

6332

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6333

6334

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6335

vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6336

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6337

vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6338

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6339

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6340

vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6341

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6342

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6343

vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6344

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6345

}

6346

6347

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

6348

*

6349

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

6350

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.

6351

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

6352

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

6353

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

6354

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6355

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

6356

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6357

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6358

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

6359

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6360

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

6361

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6362

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

6363

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

6364

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

6365

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

6366

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6367

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6368

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6369

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

6370

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6371

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

6372

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

6373

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

6374

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6375

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

6376

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6377

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

6378

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6379

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

6380

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

6381

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

6382

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

6383

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

6384

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6385

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

6386

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6387

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

6388

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6389

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

6390

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6391

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

6392

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6393

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6394

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6395

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

6396

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6397

*/

6398

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

6399

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6400

#if defined(BETA)

6401

IMAGE_DECLARATION(src2),

6402

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6403

IMAGE_DECLARATION(dst),

6404

uint src0_stride_z,

6405

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6406

#if defined(BETA)

6407

uint src2_stride_z,

6408

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6409

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6410

#if defined(REINTERPRET_INPUT_AS_3D)

6411

,

6412

uint src_cross_plane_pad

6413

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6414

#if defined(REINTERPRET_OUTPUT_AS_3D)

6415

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6416

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6417

#endif // REINTERPRET_OUTPUT_AS_3D

6418

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6419

{

6420

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6421

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

6422

6423

// Compute starting address for matrix A and Matrix B

6424

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

6425

6426

// Update address for the matrix A

6427

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6428

6429

// Update address for the matrix B

6430

src_addr.s1 += idx * sizeof(float);

6431

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6432

#if defined(REINTERPRET_INPUT_AS_3D)

6433

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6434

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

6440

// |******************|

6441

// | cross_plane_pad |

6442

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

6447

6448

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6449

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6450

zin = min(DEPTH_GEMM3D - 1, zin);

6451

6452

// Add offset due to the cross plane paddings

6453

zin *= (src_cross_plane_pad * src0_stride_y);

6454

6455

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6456

// multiply src0_stride_z by DEPTH_GEMM3D

6457

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6458

6459

#else // defined(REINTERPRET_INPUT_AS_3D)

6460

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6461

// Add offset for batched GEMM

6462

src_addr.s0 += get_global_id(2) * src0_stride_z;

6463

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6464

#endif // defined(REINTERPRET_INPUT_AS_3D)

6465

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6466

#if defined(MATRIX_B_DEPTH)

6467

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6468

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6469

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6470

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6471

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6472

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6473

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6474

float2 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6475

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6476

float2 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6477

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6478

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6479

float2 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6480

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6481

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6482

float2 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6483

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6484

6485

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6486

int i = 0;

6487

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6488

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6489

#if defined(REINTERPRET_INPUT_AS_3D)

6490

// Load values from matrix A

6491

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));

6492

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6493

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6494

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6495

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6496

6497

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6498

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6499

src_addr.s1 += src1_stride_y;

6500

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6501

src_addr.s1 += src1_stride_y;

6502

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6503

src_addr.s1 += src1_stride_y;

6504

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6505

src_addr.s1 += src1_stride_y;

6506

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6507

src_addr.s1 += src1_stride_y;

6508

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6509

src_addr.s1 += src1_stride_y;

6510

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6511

src_addr.s1 += src1_stride_y;

6512

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6513

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6514

6515

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6516

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

6517

acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);

6518

acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);

6519

acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);

6520

acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);

6521

acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);

6522

acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);

6523

acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6524

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6525

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

6526

acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);

6527

acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);

6528

acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);

6529

acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);

6530

acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);

6531

acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);

6532

acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6533

6534

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6535

#if defined(REINTERPRET_INPUT_AS_3D)

6536

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6537

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6538

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6539

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6540

acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);

6541

acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);

6542

acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);

6543

acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);

6544

acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);

6545

acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);

6546

acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);

6547

acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6548

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6549

acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);

6550

acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);

6551

acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);

6552

acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);

6553

acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);

6554

acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);

6555

acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);

6556

acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6557

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6558

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6559

#if defined(REINTERPRET_INPUT_AS_3D)

6560

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6561

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6562

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6563

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6564

acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);

6565

acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);

6566

acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);

6567

acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);

6568

acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);

6569

acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);

6570

acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);

6571

acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6572

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6573

acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);

6574

acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);

6575

acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);

6576

acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);

6577

acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);

6578

acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);

6579

acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);

6580

acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6581

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6582

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6583

#if defined(REINTERPRET_INPUT_AS_3D)

6584

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6585

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6586

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6587

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6588

acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);

6589

acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);

6590

acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);

6591

acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);

6592

acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);

6593

acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);

6594

acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);

6595

acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6596

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6597

acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);

6598

acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);

6599

acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);

6600

acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);

6601

acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);

6602

acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);

6603

acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);

6604

acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6605

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6606

6607

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6608

}

6609

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6610

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6611

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6612

#if defined(REINTERPRET_INPUT_AS_3D)

6613

// Load values from matrix A

6614

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6615

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6616

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6617

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6618

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6619

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6620

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6621

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6622

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6623

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6624

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6625

// Load values from matrix A

6626

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6627

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6628

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6629

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6630

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6631

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6632

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6633

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6634

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6635

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6636

#endif // defined(REINTERPRET_INPUT_AS_3D)

6637

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6638

// Load values from matrix B

6639

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6640

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6641

6642

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6643

acc0.s0 = fma(a0, b0.s0, acc0.s0);

6644

acc0.s1 = fma(a0, b0.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6645

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6646

acc1.s0 = fma(a1, b0.s0, acc1.s0);

6647

acc1.s1 = fma(a1, b0.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6648

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6649

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6650

acc2.s0 = fma(a2, b0.s0, acc2.s0);

6651

acc2.s1 = fma(a2, b0.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6652

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6653

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6654

acc3.s0 = fma(a3, b0.s0, acc3.s0);

6655

acc3.s1 = fma(a3, b0.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6656

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6657

6658

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6659

}

6660

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6661

int z = get_global_id(2);

6662

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6663

// Compute destination address

6664

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6665

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6666

// Compute dst address

6667

__global uchar *dst_addr = offset(&dst, 0, 0);

6668

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6669

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

6670

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6671

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6672

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6673

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6674

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6675

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

6680

// |******************|

6681

// | cross_plane_pad |

6682

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6687

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6688

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6689

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6690

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6691

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6692

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6693

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6694

6695

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6696

// multiply dst_stride_z by DEPTH_GEMM3D

6697

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6698

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6699

// Add offset for batched GEMM

6700

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6701

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6702

6703

// Multiply by the weight of matrix-matrix product and store the result

6704

#if defined(ALPHA)

6705

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

6706

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6711

6712

#if defined(BROADCAST_BIAS)

6713

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));

6714

6715

LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

6716

6717

#ifndef UNIT_BETA

6718

SCALE_BLOCK(1, float, bias, BETA);

6719

#endif // UNIT_BETA

6720

6721

// acc = acc + bias[broadcasted]

6722

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6723

6724

#else // defined(BROADCAST_BIAS)

6725

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *

6726

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6727

6728

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

6729

6730

#ifndef UNIT_BETA

6731

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6736

6737

#endif // defined(BROADCAST_BIAS)

6738

#endif // defined(BETA)

6739

6740

#if defined(ACTIVATION_TYPE)

6741

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

6742

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6743

6744

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6745

vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6746

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6747

vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6748

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6749

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6750

vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6751

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6752

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6753

vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6754

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6755

}

6756

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

6757

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6758

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

6759

*

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6760

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in 32-bit floating point variables.

6761

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

6762

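* This kernel loads and stores 8 elements along x per work-item (and offsets the bias by 8 elements per work-item), so it expects -DNUM_ELEMS_PROCESSED_PER_THREAD_X=8.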

6763

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

6764

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6765

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

6766

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6767

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6768

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

6769

* The activation function is performed after the bias addition

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6770

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

6771

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

6772

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

6773

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

6774

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

6775

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

6776

*

6777

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

6778

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6779

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

6780

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6781

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

6782

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

6783

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

6784

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6785

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

6786

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6787

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

6788

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6789

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

6790

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

6791

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

6792

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

6793

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

6794

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6795

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

6796

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6797

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

6798

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6799

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

6800

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

6801

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

6802

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6803

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6804

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

6805

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

6806

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

6807

*/

6808

__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),

6809

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6810

#if defined(BETA)

6811

IMAGE_DECLARATION(src2),

6812

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6813

IMAGE_DECLARATION(dst),

6814

uint src0_stride_z,

6815

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6816

#if defined(BETA)

6817

uint src2_stride_z,

6818

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6819

uint dst_stride_z

6820

#if defined(REINTERPRET_INPUT_AS_3D)

6821

,

6822

uint src_cross_plane_pad

6823

#endif // REINTERPRET_INPUT_AS_3D

6824

#if defined(REINTERPRET_OUTPUT_AS_3D)

6825

,

6826

uint dst_cross_plane_pad

6827

#endif // REINTERPRET_OUTPUT_AS_3D

6828

)

6829

{

6830

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

6831

6832

// Compute starting address for matrix A and Matrix B

6833

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

6834

6835

// Update address for the matrix A

6836

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6837

6838

// Update address for the matrix B

6839

src_addr.s1 += idx * sizeof(half);

6840

6841

#if defined(REINTERPRET_INPUT_AS_3D)

6842

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6843

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

6849

// |******************|

6850

// | cross_plane_pad |

6851

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

6856

6857

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6858

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6859

zin = min(DEPTH_GEMM3D - 1, zin);

6860

6861

// Add offset due to the cross plane paddings

6862

zin *= (src_cross_plane_pad * src0_stride_y);

6863

6864

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6865

// multiply src0_stride_z by DEPTH_GEMM3D

6866

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6867

6868

#else // defined(REINTERPRET_INPUT_AS_3D)

6869

6870

// Add offset for batched GEMM

6871

src_addr.s0 += get_global_id(2) * src0_stride_z;

6872

6873

#endif // defined(REINTERPRET_INPUT_AS_3D)

6874

6875

#if defined(MATRIX_B_DEPTH)

6876

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6877

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6878

#else // defined(MATRIX_B_DEPTH)

6879

src_addr.s1 += get_global_id(2) * src1_stride_z;

6880

#endif // defined(MATRIX_B_DEPTH)

6881

6882

float8 acc0 = 0.0h;

6883

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6884

float8 acc1 = 0.0h;

6885

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6886

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6887

float8 acc2 = 0.0h;

6888

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6889

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6890

float8 acc3 = 0.0h;

6891

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6892

6893

int i = 0;

6894

for(; i <= ((int)COLS_A - 4); i += 4)

6895

{

6896

#if defined(REINTERPRET_INPUT_AS_3D)

6897

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6898

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

6899

#else // defined(REINTERPRET_INPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6900

// Load values from matrix A

6901

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6902

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6903

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6904

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6905

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6906

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6907

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6908

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6909

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6910

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6911

#endif // defined(REINTERPRET_INPUT_AS_3D)

6912

6913

// Load values from matrix B

6914

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6915

src_addr.s1 += src1_stride_y;

6916

6917

// Accumulate

6918

acc0 = fma(b0, (float8)a0.s0, acc0);

6919

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6920

acc1 = fma(b0, (float8)a1.s0, acc1);

6921

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6922

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6923

acc2 = fma(b0, (float8)a2.s0, acc2);

6924

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6925

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6926

acc3 = fma(b0, (float8)a3.s0, acc3);

6927

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6928

6929

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6930

src_addr.s1 += src1_stride_y;

6931

acc0 = fma(b0, (float8)a0.s1, acc0);

6932

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6933

acc1 = fma(b0, (float8)a1.s1, acc1);

6934

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6935

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6936

acc2 = fma(b0, (float8)a2.s1, acc2);

6937

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6938

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6939

acc3 = fma(b0, (float8)a3.s1, acc3);

6940

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6941

6942

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6943

src_addr.s1 += src1_stride_y;

6944

acc0 = fma(b0, (float8)a0.s2, acc0);

6945

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6946

acc1 = fma(b0, (float8)a1.s2, acc1);

6947

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6948

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6949

acc2 = fma(b0, (float8)a2.s2, acc2);

6950

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6951

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6952

acc3 = fma(b0, (float8)a3.s2, acc3);

6953

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6954

6955

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6956

src_addr.s1 += src1_stride_y;

6957

acc0 = fma(b0, (float8)a0.s3, acc0);

6958

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6959

acc1 = fma(b0, (float8)a1.s3, acc1);

6960

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6961

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6962

acc2 = fma(b0, (float8)a2.s3, acc2);

6963

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6964

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6965

acc3 = fma(b0, (float8)a3.s3, acc3);

6966

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6967

6968

src_addr.s0 += 4 * sizeof(half);

6969

}

6970

6971

for(; i < (int)COLS_A; ++i)

6972

{

6973

#if defined(REINTERPRET_INPUT_AS_3D)

6974

// Load values from matrix A

6975

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6976

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6977

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6978

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6979

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6980

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6981

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6982

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6983

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6984

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6985

#else // defined(REINTERPRET_INPUT_AS_3D)

6986

// Load values from matrix A

6987

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6988

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6989

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6990

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6991

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6992

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6993

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6994

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6995

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6996

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6997

#endif // defined(REINTERPRET_INPUT_AS_3D)

6998

6999

// Load values from matrix B

7000

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

7001

7002

src_addr += (int2)(sizeof(half), src1_stride_y);

7003

7004

// Accumulate

7005

acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;

7006

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7007

acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;

7008

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7009

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7010

acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;

7011

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7012

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7013

acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;

7014

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7015

}

7016

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7017

int z = get_global_id(2);

7018

7019

// Compute destination address

7020

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

7021

7022

// Compute dst address

7023

__global uchar *dst_addr = offset(&dst, 0, 0);

7024

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7025

uint4 zout = 0;

7026

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7027

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7028

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7029

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

7030

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

7036

// |******************|

7037

// | cross_plane_pad |

7038

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

7043

7044

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7045

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

7046

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7047

7048

// Add offset due to the cross plane paddings

7049

zout *= (dst_cross_plane_pad * dst_stride_y);

7050

7051

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

7052

// multiply dst_stride_z by DEPTH_GEMM3D

7053

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7054

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7055

// Add offset for batched GEMM

7056

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7057

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7058

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7059

// Multiply by the weight of matrix-matrix product and store the result

7060

#if defined(ALPHA)

7061

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

7062

#endif // defined(ALPHA)

7063

7064

#if defined(BETA)

7065

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

7066

7067

#if defined(BROADCAST_BIAS)

7068

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

7069

7070

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7071

7072

float8 bias_f0 = convert_float8(bias0);

7073

7074

#ifndef UNIT_BETA

7075

SCALE_BLOCK(1, float, bias_f, BETA);

7076

#endif // UNIT_BETA

7077

7078

// acc = acc + bias[broadcasted]

7079

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);

7080

7081

#else // defined(BROADCAST_BIAS)

7082

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

7083

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

7084

7085

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7086

7087

float8 bias_f0 = convert_float8(bias0);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7088

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7089

float8 bias_f1 = convert_float8(bias1);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7090

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7091

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7092

float8 bias_f2 = convert_float8(bias2);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7093

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7094

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7095

float8 bias_f3 = convert_float8(bias3);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7096

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7097

7098

#ifndef UNIT_BETA

7099

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);

7104

7105

#endif // defined(BROADCAST_BIAS)

7106

#endif // defined(BETA)

7107

7108

half8 acc_h0 = convert_half8(acc0);

7109

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7110

half8 acc_h1 = convert_half8(acc1);

7111

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7112

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7113

half8 acc_h2 = convert_half8(acc2);

7114

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7115

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7116

half8 acc_h3 = convert_half8(acc3);

7117

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7118

7119

#if defined(ACTIVATION_TYPE)

7120

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);

7121

#endif // defined(ACTIVATION_TYPE)

7122

7123

// Store the output block

7124

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7125

}

7126

7127

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
 *
 * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
 * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
 *       This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
 *       NOTE(review): the body loads 8 halves of matrix B per row (vload8), stores 8 halves per output row and hard-codes 8 in the bias offset,
 *       so -DNUM_ELEMS_PROCESSED_PER_THREAD_X=8 looks like the value the code actually expects - confirm against the host-side configuration.
 * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
 * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
 * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
 *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note The bias (src2) arguments and the bias addition are compiled in only if -DBETA is defined. The bias is scaled by BETA unless -DUNIT_BETA is defined.
 *       If -DBROADCAST_BIAS is defined a single bias row is loaded and added to every accumulator row, otherwise one bias row is loaded per accumulator row.
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                 IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
#if defined(BETA)
                                                 uint src2_stride_z,
#endif //defined(BETA)
                                                 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                 ,
                                                 uint src_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B
    // (.s0 is the byte offset into matrix A, .s1 the byte offset into matrix B)
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx * sizeof(half);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zin       = min(DEPTH_GEMM3D - 1, zin);

    // Add offset due to the cross plane paddings
    zin *= (src_cross_plane_pad * src0_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src0_stride_z by DEPTH_GEMM3D
    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    src_addr.s0 += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src_addr.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // One 8-wide accumulator per output row handled by this work-item
    half8 acc0 = 0.0h;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    half8 acc1 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    half8 acc2 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    half8 acc3 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: consume 4 columns of matrix A (and 4 rows of matrix B) per iteration
    int i = 0;
    for(; i <= ((int)COLS_A - 4); i += 4)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
#else // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;

        // Accumulate
        acc0 = fma(b0, (half8)a0.s0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s0, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s0, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s0, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (half8)a0.s1, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s1, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s1, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (half8)a0.s2, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s2, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s2, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
        src_addr.s1 += src1_stride_y;
        acc0 = fma(b0, (half8)a0.s3, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s3, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s3, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Matrix B was advanced one row at a time above; advance matrix A by the 4 columns just consumed
        src_addr.s0 += 4 * sizeof(half);
    }

    // Left-over loop: consume the remaining (COLS_A % 4) columns one at a time
    for(; i < (int)COLS_A; ++i)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

        // Advance matrix A by one column and matrix B by one row
        src_addr += (int2)(sizeof(half), src1_stride_y);

        // Accumulate
        acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    int z = get_global_id(2);

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row extra byte offset passed to STORE_BLOCK; stays zero unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Add offset due to the cross plane paddings
    zout *= (dst_cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, half, bias, BETA);
#endif // UNIT_BETA

    // acc = acc + bias[broadcasted]
    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);
#endif // UNIT_BETA

    // acc = acc + bias
    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store the output block
    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
}

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

7472

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7473

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

7474

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7475

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7476

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7477

/** In-place weighted matrix addition: dst = dst + BETA * src
 *
 * Each work-item reads four consecutive floats from the destination and from the source,
 * adds the source values scaled by BETA to the destination values and writes the four results back to the destination.
 * The destination is expected to already contain the (alpha-weighted) A x B product - presumably written by a preceding matrix multiplication kernel.
 *
 * @note The beta's value need to be passed at compile time using -DBETA
 *
 * @param[in]  src_ptr                           Pointer to the source matrix (matrix C). Supported data types: F32
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix (read and overwritten in place). Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Per-work-item views of matrix C (src) and of the A x B result that is updated in place (dst)
    Tensor3D matrix_c  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D matrix_ab = CONVERT_TO_TENSOR3D_STRUCT(dst);

    __global float *ab_row = (__global float *)matrix_ab.ptr;
    __global float *c_row  = (__global float *)matrix_c.ptr;

    // Four values of the A x B result followed by the matching four values of matrix C
    const float4 ab_vals = vload4(0, ab_row);
    const float4 c_vals  = vload4(0, c_row);

    // axb + beta * c, written back over the A x B values
    vstore4(ab_vals + (float4)BETA * c_vals, 0, ab_row);
}

7517

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

7518

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7519

/** In-place weighted matrix addition: dst = dst + BETA * src (half precision)
 *
 * Each work-item reads eight consecutive halves from the destination and from the source,
 * adds the source values scaled by BETA to the destination values and writes the eight results back to the destination.
 * The destination is expected to already contain the (alpha-weighted) A x B product - presumably written by a preceding matrix multiplication kernel.
 *
 * @note The beta's value need to be passed at compile time using -DBETA
 *
 * @param[in]  src_ptr                           Pointer to the source matrix (matrix C). Supported data types: F16
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix (read and overwritten in place). Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Per-work-item views of matrix C (src) and of the A x B result that is updated in place (dst)
    Tensor3D matrix_c  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D matrix_ab = CONVERT_TO_TENSOR3D_STRUCT(dst);

    __global half *ab_row = (__global half *)matrix_ab.ptr;
    __global half *c_row  = (__global half *)matrix_c.ptr;

    // Eight values of the A x B result followed by the matching eight values of matrix C
    const half8 ab_vals = vload8(0, ab_row);
    const half8 c_vals  = vload8(0, c_row);

    // axb + beta * c, written back over the A x B values
    vstore8(ab_vals + (half8)BETA * c_vals, 0, ab_row);
}

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

7559

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7560

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7561

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7562

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7563

/** Vector by matrix multiplication used for the locally connected layer
 *
 * The work-item with global id (x, y) multiplies row y of A (src0), treated as a vector of WIDTH_VECTOR_A floats,
 * by the 2D slice of B (src1) found at offset y * src1_stride_z, and stores the four output values
 * that correspond to columns [4 * x, 4 * x + 3] of that slice.
 *
 * @note The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
 *
 * @note The input A and matrix B must not be reshaped
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
 * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
                             TENSOR3D_DECLARATION(src1),
                             IMAGE_DECLARATION(dst))
{
    int col = get_global_id(0) * 4;
    int row = get_global_id(1);

    // Byte offsets: .s0 walks along row `row` of A, .s1 walks down the rows of the matching slice of B
    int2 offs = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * row, src1_offset_first_element_in_bytes + src1_stride_z * row));
    offs.s1 += col * sizeof(float);

    // Byte offset one past the last element of the vector A
    int a_end = offs.s0 + (WIDTH_VECTOR_A * sizeof(float));

    float4 sum = 0.0f;

    // Main loop: two elements of A (and two rows of B) per iteration
    while(offs.s0 <= (a_end - 2 * (int)sizeof(float)))
    {
        float2 a_pair = vload2(0, (__global float *)(src0_ptr + offs.s0));
        float4 b_row0 = vload4(0, (__global float *)(src1_ptr + offs.s1));
        float4 b_row1 = vload4(0, (__global float *)(src1_ptr + offs.s1 + src1_stride_y));

        sum += b_row0 * (float4)a_pair.s0;
        sum += b_row1 * (float4)a_pair.s1;

        offs += (int2)(2 * sizeof(float), 2 * src1_stride_y);
    }

    // Left-over loop: at most one remaining element of A
    while(offs.s0 < a_end)
    {
        float  a_val = *((__global float *)(src0_ptr + offs.s0));
        float4 b_row = vload4(0, (__global float *)(src1_ptr + offs.s1));

        sum += b_row * (float4)a_val;

        offs += (int2)(sizeof(float), src1_stride_y);
    }

    // Compute destination address and store the four accumulated values
    Image out_img = CONVERT_TO_IMAGE_STRUCT(dst);

    vstore4(sum, 0, (__global float *)(offset(&out_img, 0, 0)));
}

Anton Lokhmotov