Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2019-07-19 09:54:47 +0100

[diff] [blame]

49

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

50

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

51

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

52

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

53

* @note Only the following values for M0, K0 and V0 are supported:

54

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

55

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

56

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

57

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

58

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

59

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

60

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

61

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

62

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

63

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

64

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

65

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

66

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

67

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

68

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

69

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

70

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

71

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

72

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

73

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

74

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

75

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

76

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

77

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

78

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

79

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

80

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

81

*/

82

__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                         ,
                                         uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                        )
{
    // Number of elements in one M0xK0 block
#define BLOCK_SIZE ((M0) * (K0))

    // Offset (in elements) applied per (y % V0) when computing the output address:
    // K0 when the V0 blocks are interleaved, a whole block otherwise
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif // defined(INTERLEAVE)

    // Step (in elements) handed to STORE_BLOCK as its stride argument.
    // The expansion is fully parenthesized so it stays a single operand wherever it is pasted.
#if defined(INTERLEAVE)
#define OUTPUT_STEP_X ((K0) * (V0))
#else // Do not interleave
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Input address: x advances by K0 elements along X, y advances by M0 rows along Y
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Output address: V0 consecutive values of y share the same output row (y / V0);
    // (y % V0) selects the position of the block inside that row
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y)
                                 + ((y % (uint)V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Load values from the LHS matrix into the row vectors a0 ... a(M0-1)
    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

    // Apply the X boundary handling to every loaded row
    // NOTE(review): BOUNDARY_CONDITION_X is defined outside this view; confirm its exact effect there
    BOUNDARY_CONDITION_X(x, a0);
#if M0 > 1
    BOUNDARY_CONDITION_X(x, a1);
#endif // M0 > 1
#if M0 > 2
    BOUNDARY_CONDITION_X(x, a2);
#endif // M0 > 2
#if M0 > 3
    BOUNDARY_CONDITION_X(x, a3);
#endif // M0 > 3
#if M0 > 4
    BOUNDARY_CONDITION_X(x, a4);
#endif // M0 > 4
#if M0 > 5
    BOUNDARY_CONDITION_X(x, a5);
#endif // M0 > 5
#if M0 > 6
    BOUNDARY_CONDITION_X(x, a6);
#endif // M0 > 6
#if M0 > 7
    BOUNDARY_CONDITION_X(x, a7);
#endif // M0 > 7

    // ---------------------------Store output values ------------------------------

    // Create variables: uint zout0=0, zout1=0, ... (no additional Z offset on the output)
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

175

176

// TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)
//
// Gathers component i of each row vector a0 ... a(M0-1) (i.e. column i of the loaded block)
// and stores the M0 values contiguously at output_ptr + i * output_step_x * sizeof(DATA_TYPE) bytes.
//
//  - output_ptr    : byte pointer to the start of the destination block
//  - output_step_x : distance, in elements, between two consecutive stored columns
//  - i             : a single hexadecimal digit (0-9, A-F); it is token-pasted both into the
//                    vector component selector (.s##i) and into the literal 0x##i
//
// The row vectors a0 ... a(M0-1) must be in scope at the expansion site.
// For M0 = 5, 6 and 7 the column is written as a 4-wide vector store followed by a
// scalar / 2-wide / 3-wide store of the remaining values.
// Both pointer and step arguments are parenthesized so that compound expressions can be passed safely.
#if M0 == 2
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 3 // M0 == 3
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 4 // M0 == 4
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 5 // M0 == 5
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        DATA_TYPE res1 = a4.s##i; \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        *((__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4) = res1; \
    })
#elif M0 == 6 // M0 == 6
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VEC_DATA_TYPE(DATA_TYPE, 2) \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        VSTORE(2) \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 7 // M0 == 7
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VEC_DATA_TYPE(DATA_TYPE, 3) \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        VSTORE(3) \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 8 // M0 == 8
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#else // M0 not supported
#error "M0 value not supported"
#endif // M0 conditions

245

246

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in

247

* the output matrix unrolling the values.

248

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

249

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

250

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

251

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

252

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

253

* @note Only the following values for M0, K0 and V0 are supported:

254

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

255

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

256

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

257

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

258

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

259

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

260

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

261

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

262

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

263

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

264

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

265

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

266

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

267

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

268

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

269

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

270

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

271

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

272

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

273

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

274

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

275

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

276

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

277

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

278

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

279

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

280

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

281

*/

282

__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
                                        TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                        ,
                                        uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                       )
{
    // Number of elements in one M0xK0 block
#define BLOCK_SIZE ((M0) * (K0))

    // Offset (in elements) applied per (y % V0) when computing the output address:
    // M0 when the V0 blocks are interleaved, a whole block otherwise
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (M0)
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif // defined(INTERLEAVE)

    // Step (in elements) between two consecutive transposed columns in the output.
    // The expansion is fully parenthesized so it stays a single operand wherever it is pasted.
#if defined(INTERLEAVE)
#define OUTPUT_STEP_X ((M0) * (V0))
#else // Do not interleave
#define OUTPUT_STEP_X (M0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Input address: x advances by K0 elements along X, y advances by M0 rows along Y
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Output address: V0 consecutive values of y share the same output row (y / V0);
    // (y % V0) selects the position of the block inside that row
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y)
                                 + ((y % (uint)V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Load values from the LHS matrix into the row vectors a0 ... a(M0-1)
    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

    // Apply the X boundary handling to every loaded row
    // NOTE(review): BOUNDARY_CONDITION_X is defined outside this view; confirm its exact effect there
    BOUNDARY_CONDITION_X(x, a0);
#if M0 > 1
    BOUNDARY_CONDITION_X(x, a1);
#endif // M0 > 1
#if M0 > 2
    BOUNDARY_CONDITION_X(x, a2);
#endif // M0 > 2
#if M0 > 3
    BOUNDARY_CONDITION_X(x, a3);
#endif // M0 > 3
#if M0 > 4
    BOUNDARY_CONDITION_X(x, a4);
#endif // M0 > 4
#if M0 > 5
    BOUNDARY_CONDITION_X(x, a5);
#endif // M0 > 5
#if M0 > 6
    BOUNDARY_CONDITION_X(x, a6);
#endif // M0 > 6
#if M0 > 7
    BOUNDARY_CONDITION_X(x, a7);
#endif // M0 > 7

    // ---------------------------Transpose and store block -----------------------

    // One call per column of the block; the last argument is the column index as a hex digit.
    // Columns 0 and 1 always exist because the smallest supported K0 is 2.
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
#if K0 > 2
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
#endif // K0 > 2
#if K0 > 3
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
#endif // K0 > 3
#if K0 > 4
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
#endif // K0 > 4
#if K0 > 8
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
#endif // K0 > 8

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

399

#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

400

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

401

#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

402

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in

403

* the output matrix unrolling the values.

404

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

405

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

406

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

407

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

408

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

409

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

410

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

411

* N0: 2,3,4,8,16

412

* K0: 1,2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

413

* H0: greater than 0

414

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

415

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

416

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

417

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

418

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

419

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

420

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

421

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

422

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

423

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

424

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

425

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

426

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

427

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

428

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

429

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

430

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

431

*/

432

__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst))
{
    // Number of elements in one K0xN0 block
#define BLOCK_SIZE ((K0) * (N0))

    // Offset (in elements) applied per (x % H0) when computing the output address:
    // N0 when the H0 blocks are interleaved, a whole block otherwise
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (N0)
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif // defined(INTERLEAVE)

    // Step (in elements) handed to STORE_BLOCK as its stride argument.
    // The expansion is fully parenthesized so it stays a single operand wherever it is pasted.
#if defined(INTERLEAVE)
#define OUTPUT_STEP_X ((N0) * (H0))
#else // Do not interleave
#define OUTPUT_STEP_X (N0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Input address: x advances by N0 elements along X, y advances by K0 rows along Y, z selects the batch
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

    // Output address: H0 consecutive values of x share the same output row (x / H0);
    // (x % H0) selects the position of the block inside that row
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE))
                                 + ((x / (uint)H0) * (uint)dst_stride_y)
                                 + z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Create variables: VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0);

    // Load values from the RHS matrix.
    // Row 0 is loaded unconditionally; every further row is loaded only when it lies inside
    // the source (row index < SRC_HEIGHT), otherwise its vector keeps the initial value 0.
    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
#if K0 > 1
    if(y * (uint)K0 + 1 < SRC_HEIGHT)
    {
        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
    }
#endif // K0 > 1
#if K0 > 2
    if(y * (uint)K0 + 2 < SRC_HEIGHT)
    {
        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
    }
#endif // K0 > 2
#if K0 > 3
    if(y * (uint)K0 + 3 < SRC_HEIGHT)
    {
        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
    }
#endif // K0 > 3
#if K0 > 4
    if(y * (uint)K0 + 4 < SRC_HEIGHT)
    {
        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
    }
    if(y * (uint)K0 + 5 < SRC_HEIGHT)
    {
        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
    }
    if(y * (uint)K0 + 6 < SRC_HEIGHT)
    {
        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
    }
    if(y * (uint)K0 + 7 < SRC_HEIGHT)
    {
        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
    }
#endif // K0 > 4
#if K0 > 8
    if(y * (uint)K0 + 8 < SRC_HEIGHT)
    {
        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
    }
    if(y * (uint)K0 + 9 < SRC_HEIGHT)
    {
        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
    }
    if(y * (uint)K0 + 10 < SRC_HEIGHT)
    {
        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
    }
    if(y * (uint)K0 + 11 < SRC_HEIGHT)
    {
        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
    }
    if(y * (uint)K0 + 12 < SRC_HEIGHT)
    {
        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
    }
    if(y * (uint)K0 + 13 < SRC_HEIGHT)
    {
        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
    }
    if(y * (uint)K0 + 14 < SRC_HEIGHT)
    {
        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
    }
    if(y * (uint)K0 + 15 < SRC_HEIGHT)
    {
        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
    }
#endif // K0 > 8

    // ---------------------------Store output values ------------------------------

    // Create variables: uint zout0=0, zout1=0, ... (no additional Z offset on the output)
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

#if defined(TRANSPOSE)

555

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in

556

* the output matrix unrolling the values.

557

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

558

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

559

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

560

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

561

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

562

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

563

* @note The option -DTRANSPOSE must be passed at compile time.

564

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

565

* N0: 2,3,4,8,16

566

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

567

* H0: greater than 0

568

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

569

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

570

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

571

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

572

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

573

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

574

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

575

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

576

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

577

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

578

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

579

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

580

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

581

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

582

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

583

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

584

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

585

*/

586

__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),

587

TENSOR3D_DECLARATION(dst))

588

{

589

// Block size

590

#define BLOCK_SIZE ((K0) * (N0))

591

592

// Output offset X

593

#if defined(INTERLEAVE)

594

#define OUTPUT_OFFSET_X (K0)

595

#else // defined(INTERLEAVE)

596

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

597

#endif // defined(INTERLEAVE)

598

599

// Output step X

600

#if defined(INTERLEAVE)

601

#define OUTPUT_STEP_X (K0) * (H0)

602

#else // Do not interleave

603

#define OUTPUT_STEP_X (K0)

604

#endif // defined(INTERLEAVE)

605

606

// Compute source and destination addresses

607

uint x = get_global_id(0);

608

uint y = get_global_id(1);

609

uint z = get_global_id(2);

610

611

// ------------------ Compute input/output addresses ---------------------------

612

613

// Compute the input address

614

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

615

616

// Compute the output address

617

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /

618

(uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

619

620

// ---------------------------Load input values --------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

621

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

622

623

// Load values from the RHS matrix

624

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

625

if(y * (uint)K0 + 1 < SRC_HEIGHT)

626

{

627

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

628

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

629

#if K0 > 2

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

630

if(y * (uint)K0 + 2 < SRC_HEIGHT)

631

{

632

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

633

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

634

#endif // K0 > 2

635

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

636

if(y * (uint)K0 + 3 < SRC_HEIGHT)

637

{

638

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

639

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

640

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

641

#if K0 > 4

642

if(y * (uint)K0 + 4 < SRC_HEIGHT)

643

{

644

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

645

}

646

if(y * (uint)K0 + 5 < SRC_HEIGHT)

647

{

648

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

649

}

650

if(y * (uint)K0 + 6 < SRC_HEIGHT)

651

{

652

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

653

}

654

if(y * (uint)K0 + 7 < SRC_HEIGHT)

655

{

656

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

657

}

658

#endif // K0 > 4

659

#if K0 > 8

Gian Marco Iodice

8912434

2018-12-19 14:17:22 +0000

[diff] [blame]

660

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

661

{

662

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

663

}

664

if(y * (uint)K0 + 9 < SRC_HEIGHT)

665

{

666

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

667

}

668

if(y * (uint)K0 + 10 < SRC_HEIGHT)

669

{

670

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

671

}

672

if(y * (uint)K0 + 11 < SRC_HEIGHT)

673

{

674

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

675

}

676

if(y * (uint)K0 + 12 < SRC_HEIGHT)

677

{

678

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

679

}

680

if(y * (uint)K0 + 13 < SRC_HEIGHT)

681

{

682

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

683

}

684

if(y * (uint)K0 + 14 < SRC_HEIGHT)

685

{

686

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

687

}

688

if(y * (uint)K0 + 15 < SRC_HEIGHT)

689

{

690

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Transpose the block ------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

695

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

696

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

697

#if K0 == 2

698

// This part computes the following transpositions:

// 2x2 -> 2x2

// 2x4 -> 4x2

// 2x8 -> 8x2

// 2x16 -> 16x2

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);

704

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);

705

#if N0 > 2

706

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);

707

#endif // N0 > 2

708

#if N0 > 3

709

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);

710

#endif // N0 > 3

711

#if N0 > 4

712

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);

713

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);

714

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);

715

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);

716

#endif // N0 > 4

717

#if N0 > 8

718

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);

719

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);

720

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);

721

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);

722

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);

723

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);

724

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);

725

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);

726

#endif // N0 > 8

727

728

#elif K0 == 3 // K0 == 2

729

// This part computes the following transpositions:

// 3x2 -> 2x3

// 3x4 -> 4x3

// 3x8 -> 8x3

// 3x16 -> 16x3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

734

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);

735

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

736

#if N0 > 2

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

737

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

738

#endif // N0 > 2

739

#if N0 > 3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

740

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

741

#endif // N0 > 3

742

#if N0 > 4

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

743

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);

744

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);

745

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);

746

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

747

#endif // N0 > 4

748

#if N0 > 8

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

749

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);

750

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);

751

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);

752

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);

753

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);

754

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);

755

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);

756

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

757

#endif // N0 > 8

758

759

#elif K0 == 4 // K0 == 4

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

760

// This part computes the following transpositions:

// 4x2 -> 2x4

// 4x4 -> 4x4

// 4x8 -> 8x4

// 4x16 -> 16x4

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);

766

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);

767

#if N0 > 2

768

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

769

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

770

#if N0 > 3

771

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);

772

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

773

#if N0 > 4

774

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);

775

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);

776

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);

777

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);

778

#endif // N0 > 4

779

#if N0 > 8

780

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);

781

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);

782

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);

783

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);

784

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);

785

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);

786

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);

787

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);

788

#endif // N0 > 8

789

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

790

#elif K0 == 8 // K0 == 8

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

791

// This part computes the following transpositions:

// 8x2 -> 2x8

// 8x4 -> 4x8

// 8x8 -> 8x8

// 8x16 -> 16x8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

796

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);

797

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

798

#if N0 > 2

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

799

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

800

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

801

#if N0 > 3

802

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);

803

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

804

#if N0 > 4

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

805

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);

806

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);

807

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);

808

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

809

#endif // N0 > 4

810

#if N0 > 8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

811

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);

812

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);

813

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);

814

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);

815

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);

816

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);

817

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);

818

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

819

#endif // N0 > 8

820

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

821

#elif K0 == 16 // K0 == 16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

822

823

// This part computes the following transpositions:

// 16x2 -> 2x16

// 16x4 -> 4x16

// 16x8 -> 8x16

// 16x16 -> 16x16

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,

829

a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);

830

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,

831

a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);

832

#if N0 > 2

833

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,

834

a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

835

#endif // N0 > 2

836

#if N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

837

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,

838

a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

839

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

840

#if N0 > 4

841

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,

842

a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);

843

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,

844

a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);

845

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,

846

a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);

847

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,

848

a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);

849

#endif // N0 > 4

850

#if N0 > 8

851

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,

852

a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);

853

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,

854

a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);

855

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,

856

a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);

857

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,

858

a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);

859

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,

860

a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);

861

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,

862

a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);

863

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,

864

a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);

865

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,

866

a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);

#endif // N0 > 8

#else // N0 == 16

#error "Not supported N0 value"

871

#endif // N0 > 2

872

873

// ---------------------------Store the output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

874

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

875

STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

876

877

#undef BLOCK_SIZE

878

#undef OUTPUT_OFFSET_X

879

#undef OUTPUT_STEP_X

880

}

881

#endif // defined(TRANSPOSE)

882

#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

883

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

884

#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

885

886

#define CONCAT(a, b) a##b

/** Dot-product accumulation helpers: ARM_DOT<k>(a, b, c) performs c += dot(a, b) over k components.
 *
 * ARM_DOT1 takes scalar operands. ARM_DOT2/3/4 read components .s0 to .s3 of @p a and @p b.
 * ARM_DOT8 and ARM_DOT16 split their operands into .lo/.hi halves and delegate to the
 * next smaller helper. Every step is a single fma() into the accumulator.
 *
 * @note @p c must be a modifiable lvalue. It is evaluated (and written) once per component.
 * @note @p a and @p b are evaluated once per component, so they must not have side effects.
 * @note The macros are GNU statement expressions, as in the original implementation.
 *
 * @param[in]      a First operand
 * @param[in]      b Second operand
 * @param[in, out] c Scalar accumulator
 */
#define ARM_DOT1(a, b, c)         \
    ({                            \
        (c) = fma((a), (b), (c)); \
    })
#define ARM_DOT2(a, b, c)               \
    ({                                  \
        (c) = fma((a).s0, (b).s0, (c)); \
        (c) = fma((a).s1, (b).s1, (c)); \
    })
#define ARM_DOT3(a, b, c)               \
    ({                                  \
        ARM_DOT2(a, b, c);              \
        (c) = fma((a).s2, (b).s2, (c)); \
    })
#define ARM_DOT4(a, b, c)               \
    ({                                  \
        ARM_DOT3(a, b, c);              \
        (c) = fma((a).s3, (b).s3, (c)); \
    })
#define ARM_DOT8(a, b, c)                \
    ({                                   \
        ARM_DOT4(((a).lo), ((b).lo), c); \
        ARM_DOT4(((a).hi), ((b).hi), c); \
    })
#define ARM_DOT16(a, b, c)               \
    ({                                   \
        ARM_DOT8(((a).lo), ((b).lo), c); \
        ARM_DOT8(((a).hi), ((b).hi), c); \
    })

/** ARM_DOT_K0XN0(k0, a, b, c): accumulates N0 dot products of length k0 into the vector accumulator c.
 *
 * Component j of @p c receives ARM_DOT<k0>(a, b<j>, c.s<j>), where b<j> is the identifier
 * obtained by pasting the suffix j (0..9, A..F) onto @p b.
 *
 * @note @p b must be a plain identifier prefix (it is token-pasted), e.g. "b" for b0, b1, ...
 * @note @p k0 is macro-expanded before being pasted onto ARM_DOT, so a macro such as K0 can be passed.
 *
 * @param[in]      k0 Number of accumulations per dot product (selects ARM_DOT<k0>)
 * @param[in]      a  Left-hand operand shared by all N0 dot products
 * @param[in]      b  Identifier prefix of the N0 right-hand operands
 * @param[in, out] c  Accumulator vector with at least N0 components
 */
#if N0 == 2
#define ARM_DOT_K0XN0(k0, a, b, c)                   \
    ({                                               \
        CONCAT(ARM_DOT, k0)((a), (b##0), ((c).s0));  \
        CONCAT(ARM_DOT, k0)((a), (b##1), ((c).s1));  \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(k0, a, b, c)                   \
    ({                                               \
        CONCAT(ARM_DOT, k0)((a), (b##0), ((c).s0));  \
        CONCAT(ARM_DOT, k0)((a), (b##1), ((c).s1));  \
        CONCAT(ARM_DOT, k0)((a), (b##2), ((c).s2));  \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(k0, a, b, c)                   \
    ({                                               \
        CONCAT(ARM_DOT, k0)((a), (b##0), ((c).s0));  \
        CONCAT(ARM_DOT, k0)((a), (b##1), ((c).s1));  \
        CONCAT(ARM_DOT, k0)((a), (b##2), ((c).s2));  \
        CONCAT(ARM_DOT, k0)((a), (b##3), ((c).s3));  \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(k0, a, b, c)                   \
    ({                                               \
        CONCAT(ARM_DOT, k0)((a), (b##0), ((c).s0));  \
        CONCAT(ARM_DOT, k0)((a), (b##1), ((c).s1));  \
        CONCAT(ARM_DOT, k0)((a), (b##2), ((c).s2));  \
        CONCAT(ARM_DOT, k0)((a), (b##3), ((c).s3));  \
        CONCAT(ARM_DOT, k0)((a), (b##4), ((c).s4));  \
        CONCAT(ARM_DOT, k0)((a), (b##5), ((c).s5));  \
        CONCAT(ARM_DOT, k0)((a), (b##6), ((c).s6));  \
        CONCAT(ARM_DOT, k0)((a), (b##7), ((c).s7));  \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(k0, a, b, c)                   \
    ({                                               \
        CONCAT(ARM_DOT, k0)((a), (b##0), ((c).s0));  \
        CONCAT(ARM_DOT, k0)((a), (b##1), ((c).s1));  \
        CONCAT(ARM_DOT, k0)((a), (b##2), ((c).s2));  \
        CONCAT(ARM_DOT, k0)((a), (b##3), ((c).s3));  \
        CONCAT(ARM_DOT, k0)((a), (b##4), ((c).s4));  \
        CONCAT(ARM_DOT, k0)((a), (b##5), ((c).s5));  \
        CONCAT(ARM_DOT, k0)((a), (b##6), ((c).s6));  \
        CONCAT(ARM_DOT, k0)((a), (b##7), ((c).s7));  \
        CONCAT(ARM_DOT, k0)((a), (b##8), ((c).s8));  \
        CONCAT(ARM_DOT, k0)((a), (b##9), ((c).s9));  \
        CONCAT(ARM_DOT, k0)((a), (b##A), ((c).sA));  \
        CONCAT(ARM_DOT, k0)((a), (b##B), ((c).sB));  \
        CONCAT(ARM_DOT, k0)((a), (b##C), ((c).sC));  \
        CONCAT(ARM_DOT, k0)((a), (b##D), ((c).sD));  \
        CONCAT(ARM_DOT, k0)((a), (b##E), ((c).sE));  \
        CONCAT(ARM_DOT, k0)((a), (b##F), ((c).sF));  \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

1007

1008

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1009

* The LHS matrix is NOT reshaped

1010

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1011

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1012

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1013

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1014

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

1015

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1016

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1017

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1018

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1019

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1020

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1021

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1022

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1023

* - N0 = 2, 3, 4, 8, 16

1024

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1025

* - H0 >= 1

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1026

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1027

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1028

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1029

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1030

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1031

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1032

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1033

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1034

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1035

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1036

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1037

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1038

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1039

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1040

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1041

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1042

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1043

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1044

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1045

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1046

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1047

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1048

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1049

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1050

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1051

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1052

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1053

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1054

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

1055

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1056

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1057

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1058

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1059

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1060

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1061

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1062

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1063

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1064

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1065

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1066

*/

1067

__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),

1068

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1069

#if defined(BETA)

1070

IMAGE_DECLARATION(bias),

1071

#endif // defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1072

IMAGE_DECLARATION(dst),

1073

uint lhs_stride_z,

1074

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1075

#if defined(BETA)

1076

uint bias_stride_z,

1077

#endif //defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1078

uint dst_stride_z

1079

#if defined(REINTERPRET_INPUT_AS_3D)

1080

,

1081

uint lhs_cross_plane_pad

1082

#endif // REINTERPRET_INPUT_AS_3D

1083

#if defined(REINTERPRET_OUTPUT_AS_3D)

1084

,

1085

uint dst_cross_plane_pad

1086

#endif // REINTERPRET_OUTPUT_AS_3D

1087

)

1088

{

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1089

// Block size

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1090

#define RHS_BLOCK_SIZE ((K0) * (N0))

1091

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1092

// RHS offset and step X

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1093

#if defined(RHS_INTERLEAVE)

1094

#define RHS_OFFSET_X (K0)

1095

#define RHS_STEP_X ((K0) * (H0))

1096

#define RHS_STEP_LOOP (1)

1097

#else // defined(RHS_INTERLEAVE)

1098

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1099

#define RHS_STEP_X (K0)

1100

#define RHS_STEP_LOOP (H0)

1101

#endif // defined(RHS_INTERLEAVE)

1102

1103

uint x = get_global_id(0);

1104

uint y = get_global_id(1);

1105

uint z = get_global_id(2);

1106

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1107

#if defined(DUMMY_WORK_ITEMS)

1108

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1113

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1114

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

1115

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1116

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1117

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1118

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1119

1120

#if defined(MATRIX_B_DEPTH)

1121

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1122

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1123

#else // defined(MATRIX_B_DEPTH)

1124

rhs_offset += z * rhs_stride_z;

1125

#endif // defined(MATRIX_B_DEPTH)

1126

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1127

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1128

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1129

1130

#if defined(REINTERPRET_INPUT_AS_3D)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1131

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1132

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1133

1134

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1135

// multiply lhs_stride_z by DEPTH_GEMM3D

1136

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1137

1138

#else // defined(REINTERPRET_INPUT_AS_3D)

1139

1140

// Add offset for batched GEMM

1141

lhs_offset += z * lhs_stride_z;

1142

1143

#endif // defined(REINTERPRET_INPUT_AS_3D)

1144

1145

// Initialize the accumulators

1146

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1147

1148

int i = 0;

1149

for(; i <= (K - K0); i += K0)

1150

{

1151

// Supported cases (M0, K0):

1152

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1153

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1154

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1155

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1156

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1157

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1158

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1159

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1160

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1161

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1162

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1163

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1164

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1165

1166

// Accumulate

1167

ARM_DOT_K0XN0(K0, a0, b, c0);

1168

#if M0 > 1

1169

ARM_DOT_K0XN0(K0, a1, b, c1);

1170

#endif // M0 > 1

1171

#if M0 > 2

1172

ARM_DOT_K0XN0(K0, a2, b, c2);

1173

#endif // M0 > 2

1174

#if M0 > 3

1175

ARM_DOT_K0XN0(K0, a3, b, c3);

1176

#endif // M0 > 3

1177

#if M0 > 4

1178

ARM_DOT_K0XN0(K0, a4, b, c4);

1179

#endif // M0 > 4

1180

#if M0 > 5

1181

ARM_DOT_K0XN0(K0, a5, b, c5);

1182

#endif // M0 > 5

1183

#if M0 > 6

1184

ARM_DOT_K0XN0(K0, a6, b, c6);

1185

#endif // M0 > 6

1186

#if M0 > 7

1187

ARM_DOT_K0XN0(K0, a7, b, c7);

1188

#endif // M0 > 7

1189

1190

lhs_offset += K0 * sizeof(DATA_TYPE);

1191

rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

1192

}

1193

1194

// Left-over accumulations

1195

for(; i < K; ++i)

1196

{

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1197

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1198

LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1199

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1200

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1201

LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1202

1203

// Accumulate

1204

ARM_DOT_K0XN0(1, a0, b, c0);

1205

#if M0 > 1

1206

ARM_DOT_K0XN0(1, a1, b, c1);

1207

#endif // M0 > 1

1208

#if M0 > 2

1209

ARM_DOT_K0XN0(1, a2, b, c2);

1210

#endif // M0 > 2

1211

#if M0 > 3

1212

ARM_DOT_K0XN0(1, a3, b, c3);

1213

#endif // M0 > 3

1214

#if M0 > 4

1215

ARM_DOT_K0XN0(1, a4, b, c4);

1216

#endif // M0 > 4

1217

#if M0 > 5

1218

ARM_DOT_K0XN0(1, a5, b, c5);

1219

#endif // M0 > 5

1220

#if M0 > 6

1221

ARM_DOT_K0XN0(1, a6, b, c6);

1222

#endif // M0 > 6

1223

#if M0 > 7

1224

ARM_DOT_K0XN0(1, a7, b, c7);

1225

#endif // M0 > 7

1226

1227

lhs_offset += sizeof(DATA_TYPE);

1228

rhs_offset += sizeof(DATA_TYPE);

1229

}

1230

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

1231

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1232

1233

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1234

1235

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1236

1237

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1238

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1239

1240

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1241

// multiply dst_stride_z by DEPTH_GEMM3D

1242

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1243

1244

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1245

1246

// Add offset for batched GEMM

1247

dst_addr += z * dst_stride_z;

1248

1249

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1250

1251

// Multiply by the weight of matrix-matrix product and store the result

1252

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1253

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1254

#endif // defined(ALPHA)

1255

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1256

// Add beta*bias

1257

#if defined(BETA)

1258

#if defined(BROADCAST_BIAS)

1259

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1260

1261

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1262

1263

#ifndef UNIT_BETA

1264

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1265

#endif // UNIT_BETA

1266

1267

// c = c + bias[broadcasted]

1268

ADD_BLOCK_BROADCAST(M0, c, bias0);

1269

1270

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

1271

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1272

1273

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1274

1275

#ifndef UNIT_BETA

1276

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

1281

1282

#endif // defined(BROADCAST_BIAS)

1283

#endif // defined(BETA)

1284

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1285

#if defined(ACTIVATION_TYPE)

1286

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

1287

#endif // defined(ACTIVATION_TYPE)

1288

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1289

// Store output block

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

1290

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1291

1292

#undef RHS_BLOCK_SIZE

1293

#undef RHS_OFFSET_X

1294

#undef RHS_STEP_X

1295

}

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1296

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1297

#if defined(OPENCL_IMAGE_SUPPORT)

1298

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image

1299

* The LHS matrix is NOT reshaped

1300

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1301

*

1302

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

1303

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

1304

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1305

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

1306

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

1307

* could be different from the value returned by get_image_height(rhs_img).

1308

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1309

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1310

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

1311

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1312

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1313

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1314

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1315

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

1321

* The activation function is performed after the bias addition

1322

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1323

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1324

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1325

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1326

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1327

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1328

*

1329

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32

1330

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

1331

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

1332

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

1333

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

1334

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

1335

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

1336

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1337

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1338

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1339

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1340

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1341

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1342

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

1343

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1344

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1345

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1346

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1347

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1348

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

1349

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1350

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

1351

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1352

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1353

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

1354

*/

1355

__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),

1356

__read_only image2d_t rhs_img,

1357

#if defined(BETA)

1358

IMAGE_DECLARATION(bias),

1359

#endif // defined(BETA)

1360

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

1366

uint dst_stride_z

1367

#if defined(REINTERPRET_INPUT_AS_3D)

1368

,

1369

uint lhs_cross_plane_pad

1370

#endif // REINTERPRET_INPUT_AS_3D

1371

#if defined(REINTERPRET_OUTPUT_AS_3D)

1372

,

1373

uint dst_cross_plane_pad

1374

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

1379

1380

#define LEFTOVER_K (K % K0)

1381

1382

// Block size

1383

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

1384

1385

// RHS offset and step X

1386

#if defined(RHS_INTERLEAVE)

1387

#define RHS_OFFSET_X (PIXEL_UNIT)

1388

#define RHS_STEP_X (PIXEL_UNIT * (H0))

1389

#define RHS_STEP_LOOP (1)

1390

#else // defined(RHS_INTERLEAVE)

1391

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1392

#define RHS_STEP_X PIXEL_UNIT

1393

#define RHS_STEP_LOOP (H0)

1394

#endif // defined(RHS_INTERLEAVE)

1395

1396

uint x = get_global_id(0);

1397

uint y = get_global_id(1);

1398

uint z = get_global_id(2);

1399

1400

#if defined(DUMMY_WORK_ITEMS)

1401

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1406

1407

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

1408

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1409

1410

#if defined(MATRIX_B_DEPTH)

1411

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1412

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

1413

#else // defined(MATRIX_B_DEPTH)

1414

const uint z_rhs = get_global_id(2);

1415

#endif // defined(MATRIX_B_DEPTH)

1416

1417

// Compute RHS matrix coordinates

1418

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

1419

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

1420

1421

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

1422

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

1423

1424

#if defined(REINTERPRET_INPUT_AS_3D)

1425

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1426

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

1427

1428

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1429

// multiply lhs_stride_z by DEPTH_GEMM3D

1430

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1431

1432

#else // defined(REINTERPRET_INPUT_AS_3D)

1433

1434

// Add offset for batched GEMM

1435

lhs_offset += z * lhs_stride_z;

1436

1437

#endif // defined(REINTERPRET_INPUT_AS_3D)

1438

1439

// Initialize the accumulators

1440

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

1441

1442

int i = 0;

1443

for(; i <= (K - K0); i += K0)

1444

{

1445

// Load values from LHS matrix

1446

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

1447

1448

// Load values from RHS matrix stored in a cl_image

1449

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

1450

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

1451

1452

// Accumulate

1453

ARM_DOT_K0XN0(K0, a0, b, c0);

1454

#if M0 > 1

1455

ARM_DOT_K0XN0(K0, a1, b, c1);

1456

#endif // M0 > 1

1457

#if M0 > 2

1458

ARM_DOT_K0XN0(K0, a2, b, c2);

1459

#endif // M0 > 2

1460

#if M0 > 3

1461

ARM_DOT_K0XN0(K0, a3, b, c3);

1462

#endif // M0 > 3

1463

#if M0 > 4

1464

ARM_DOT_K0XN0(K0, a4, b, c4);

1465

#endif // M0 > 4

1466

#if M0 > 5

1467

ARM_DOT_K0XN0(K0, a5, b, c5);

1468

#endif // M0 > 5

1469

#if M0 > 6

1470

ARM_DOT_K0XN0(K0, a6, b, c6);

1471

#endif // M0 > 6

1472

#if M0 > 7

1473

ARM_DOT_K0XN0(K0, a7, b, c7);

1474

#endif // M0 > 7

1475

1476

lhs_offset += K0 * sizeof(DATA_TYPE);

1477

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

}

#if LEFTOVER_K != 0

// Note: We cannot read out-of-bound elements from the RHS matrix because

1482

// the RHS width is always multiple of K0. This is not be true for the LHS matrix

union UNION_VEC_TYPE

{

DATA_TYPE s[K0];

VEC_DATA_TYPE(DATA_TYPE, K0)

v;

};

union UNION_VEC_TYPE a0 = {.v = 0 };

1492

#if M0 > 1

1493

union UNION_VEC_TYPE a1 = {.v = 0 };

1494

#endif // M0 > 1

1495

#if M0 > 2

1496

union UNION_VEC_TYPE a2 = {.v = 0 };

1497

#endif // M0 > 2

1498

#if M0 > 3

1499

union UNION_VEC_TYPE a3 = {.v = 0 };

1500

#endif // M0 > 3

1501

#if M0 > 4

1502

union UNION_VEC_TYPE a4 = {.v = 0 };

1503

#endif // M0 > 4

1504

#if M0 > 5

1505

union UNION_VEC_TYPE a5 = {.v = 0 };

1506

#endif // M0 > 5

1507

#if M0 > 6

1508

union UNION_VEC_TYPE a6 = {.v = 0 };

1509

#endif // M0 > 6

1510

#if M0 > 7

1511

union UNION_VEC_TYPE a7 = {.v = 0 };

1512

#endif // M0 > 7

1513

1514

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

1515

1516

// Load from RHS matrix

1517

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

1518

1519

// Load from LHS matrix

1520

for(int k = 0; k < LEFTOVER_K; ++k)

1521

{

1522

a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);

1523

#if M0 > 1

1524

a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);

1525

#endif // M0 > 1

1526

#if M0 > 2

1527

a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);

1528

#endif // M0 > 2

1529

#if M0 > 3

1530

a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);

1531

#endif // M0 > 3

1532

#if M0 > 4

1533

a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);

1534

#endif // M0 > 4

1535

#if M0 > 5

1536

a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);

1537

#endif // M0 > 5

1538

#if M0 > 6

1539

a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);

1540

#endif // M0 > 6

1541

#if M0 > 7

1542

a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);

1543

#endif // M0 > 7

1544

1545

lhs_offset += sizeof(DATA_TYPE);

}

// Accumulate

ARM_DOT_K0XN0(K0, a0.v, b, c0);

1550

#if M0 > 1

1551

ARM_DOT_K0XN0(K0, a1.v, b, c1);

1552

#endif // M0 > 1

1553

#if M0 > 2

1554

ARM_DOT_K0XN0(K0, a2.v, b, c2);

1555

#endif // M0 > 2

1556

#if M0 > 3

1557

ARM_DOT_K0XN0(K0, a3.v, b, c3);

1558

#endif // M0 > 3

1559

#if M0 > 4

1560

ARM_DOT_K0XN0(K0, a4.v, b, c4);

1561

#endif // M0 > 4

1562

#if M0 > 5

1563

ARM_DOT_K0XN0(K0, a5.v, b, c5);

1564

#endif // M0 > 5

1565

#if M0 > 6

1566

ARM_DOT_K0XN0(K0, a6.v, b, c6);

1567

#endif // M0 > 6

1568

#if M0 > 7

1569

ARM_DOT_K0XN0(K0, a7.v, b, c7);

1570

#endif // M0 > 7

1571

1572

#endif // LEFTOVER_K != 0

1573

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

1574

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1575

1576

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1577

1578

#if defined(REINTERPRET_OUTPUT_AS_3D)

1579

1580

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1581

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

1582

1583

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1584

// multiply dst_stride_z by DEPTH_GEMM3D

1585

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1586

1587

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1588

1589

// Add offset for batched GEMM

1590

dst_addr += z * dst_stride_z;

1591

1592

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1593

1594

// Multiply by the weight of matrix-matrix product and store the result

1595

#if defined(ALPHA)

1596

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

1597

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

1602

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1603

1604

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1605

1606

#ifndef UNIT_BETA

1607

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1608

#endif // UNIT_BIAS

1609

1610

// c = c + bias[broadcasted]

1611

ADD_BLOCK_BROADCAST(M0, c, bias0);

1612

1613

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

1614

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1615

1616

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1617

1618

#ifndef UNIT_BETA

1619

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

1624

1625

#endif // defined(BROADCAST_BIAS)

1626

#endif // defined(BETA)

1627

1628

#if defined(ACTIVATION_TYPE)

1629

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

1630

#endif // defined(ACTIVATION_TYPE)

1631

1632

// Store output block

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

1633

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1634

1635

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef LEFTOVER_K

#undef PIXEL_UNIT

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

1642

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1643

/** Fused multiply-add accumulate: c = fma(a, b, c).
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator; must be an lvalue, it is overwritten with the result
 *
 * The continuation lines must be contiguous: a blank line after a trailing
 * backslash ends the macro definition.
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/** Accumulate one K step for M0 rows: for every row r in [0, M0),
 *  c##r = fma((N0-wide vector built from component i of a##r), b, c##r).
 *
 * @param[in]      i Component suffix pasted after ".s" to select one element of each a##r
 * @param[in]      a Basename of the M0 LHS row vectors (a##0 ... a##(M0-1))
 * @param[in]      b RHS vector, used unchanged for every row
 * @param[in, out] c Basename of the M0 accumulators (c##0 ... c##(M0-1))
 *
 * One variant is selected at preprocessing time from M0 (1..8); any other
 * value stops the build with an #error.
 */
#if M0 == 1
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

1719

1720

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1721

* The LHS matrix is NOT reshaped

1722

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

1723

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1724

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1725

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

1726

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1727

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1728

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1729

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1730

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1731

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1732

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1733

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1734

* - N0 = 2, 3, 4, 8, 16

1735

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1736

* - H0 >= 1

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1737

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1738

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1739

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1740

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1741

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1742

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1743

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1744

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1745

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1746

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1747

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1748

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1749

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1750

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1751

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1752

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1753

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1754

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1755

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem (in bytes)

1756

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1757

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem (in bytes)

1758

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1759

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1760

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1761

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1762

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1763

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1764

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1765

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

1766

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1767

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1768

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1769

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1770

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1771

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1772

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1773

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1774

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1775

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1776

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1777

*/

1778

__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),

1779

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1780

#if defined(BETA)

1781

IMAGE_DECLARATION(bias),

1782

#endif // defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1783

IMAGE_DECLARATION(dst),

1784

uint lhs_stride_z,

1785

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1786

#if defined(BETA)

1787

uint bias_stride_z,

1788

#endif //defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1789

uint dst_stride_z

1790

#if defined(REINTERPRET_INPUT_AS_3D)

1791

,

1792

uint lhs_cross_plane_pad

1793

#endif // REINTERPRET_INPUT_AS_3D

1794

#if defined(REINTERPRET_OUTPUT_AS_3D)

1795

,

1796

uint dst_cross_plane_pad

1797

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

1802

1803

// RHS offset and step X

1804

#if defined(RHS_INTERLEAVE)

1805

#define RHS_OFFSET_X (N0)

1806

#define RHS_STEP_X ((N0) * (H0))

1807

#define RHS_STEP_LOOP (1)

1808

#else // defined(RHS_INTERLEAVE)

1809

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1810

#define RHS_STEP_X (N0)

1811

#define RHS_STEP_LOOP (H0)

1812

#endif // defined(RHS_INTERLEAVE)

1813

1814

uint x = get_global_id(0);

1815

uint y = get_global_id(1);

1816

uint z = get_global_id(2);

1817

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1818

#if defined(DUMMY_WORK_ITEMS)

1819

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1824

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1825

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

1826

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1827

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1828

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1829

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1830

1831

#if defined(MATRIX_B_DEPTH)

1832

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1833

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1834

#else // defined(MATRIX_B_DEPTH)

1835

rhs_offset += z * rhs_stride_z;

1836

#endif // defined(MATRIX_B_DEPTH)

1837

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1838

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;

1839

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //16 uint variables initialised to 0: zero0=0,zero1=0,zero2=0,...

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1840

1841

#if defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1842

1843

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1844

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1845

1846

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1847

// multiply lhs_stride_z by DEPTH_GEMM3D

1848

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1849

1850

#else // defined(REINTERPRET_INPUT_AS_3D)

1851

1852

// Add offset for batched GEMM

1853

lhs_offset += z * lhs_stride_z;

1854

1855

#endif // defined(REINTERPRET_INPUT_AS_3D)

1856

1857

// Initialize the accumulators

1858

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1859

1860

int i = 0;

1861

for(; i <= (K - K0); i += K0)

1862

{

1863

// Supported cases (M0, K0):

1864

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1865

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1866

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1867

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1868

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1869

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1870

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1871

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1872

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1873

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1874

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1875

VEC_DATA_TYPE(DATA_TYPE, N0)

1876

b0;

1877

1878

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1879

VFMA_M0xN0(0, a, b0, c);

1880

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));

1881

VFMA_M0xN0(1, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1882

#if K0 > 2

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1883

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));

1884

VFMA_M0xN0(2, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1885

#endif // K0 > 2

1886

#if K0 > 3

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1887

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));

1888

VFMA_M0xN0(3, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1889

#endif // K0 > 3

1890

#if K0 > 4

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1891

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));

1892

VFMA_M0xN0(4, a, b0, c);

1893

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));

1894

VFMA_M0xN0(5, a, b0, c);

1895

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));

1896

VFMA_M0xN0(6, a, b0, c);

1897

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));

1898

VFMA_M0xN0(7, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1899

#endif // K0 > 4

1900

#if K0 > 8

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1901

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));

1902

VFMA_M0xN0(8, a, b0, c);

1903

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));

1904

VFMA_M0xN0(9, a, b0, c);

1905

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));

1906

VFMA_M0xN0(A, a, b0, c);

1907

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));

1908

VFMA_M0xN0(B, a, b0, c);

1909

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));

1910

VFMA_M0xN0(C, a, b0, c);

1911

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));

1912

VFMA_M0xN0(D, a, b0, c);

1913

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));

1914

VFMA_M0xN0(E, a, b0, c);

1915

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));

1916

VFMA_M0xN0(F, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1917

#endif // K0 > 8

1918

1919

lhs_offset += K0 * sizeof(DATA_TYPE);

1920

rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);

1921

}

1922

1923

// Left-over accumulations

1924

for(; i < K; ++i)

1925

{

1926

// Load values from LHS matrix

1927

VEC_DATA_TYPE(DATA_TYPE, 2)

1928

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

1929

#if M0 > 1

1930

VEC_DATA_TYPE(DATA_TYPE, 2)

1931

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

1932

#endif // M0 > 1

1933

#if M0 > 2

1934

VEC_DATA_TYPE(DATA_TYPE, 2)

1935

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

1936

#endif // M0 > 2

1937

#if M0 > 3

1938

VEC_DATA_TYPE(DATA_TYPE, 2)

1939

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

1940

#endif // M0 > 3

1941

#if M0 > 4

1942

VEC_DATA_TYPE(DATA_TYPE, 2)

1943

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

1944

#endif // M0 > 4

1945

#if M0 > 5

1946

VEC_DATA_TYPE(DATA_TYPE, 2)

1947

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

1948

#endif // M0 > 5

1949

#if M0 > 6

1950

VEC_DATA_TYPE(DATA_TYPE, 2)

1951

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

1952

#endif // M0 > 6

1953

#if M0 > 7

1954

VEC_DATA_TYPE(DATA_TYPE, 2)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

1955

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1956

#endif // M0 > 7

1957

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1958

VEC_DATA_TYPE(DATA_TYPE, N0)

1959

b0;

1960

1961

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1962

VFMA_M0xN0(0, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1963

1964

lhs_offset += sizeof(DATA_TYPE);

1965

rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);

1966

}

1967

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

1968

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1969

1970

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1971

1972

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1973

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1974

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1975

1976

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1977

// multiply dst_stride_z by DEPTH_GEMM3D

1978

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1979

1980

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1981

1982

// Add offset for batched GEMM

1983

dst_addr += z * dst_stride_z;

1984

1985

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1986

1987

// Multiply by the weight of matrix-matrix product and store the result

1988

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1989

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1990

#endif // defined(ALPHA)

1991

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1992

// Add beta*bias

1993

#if defined(BETA)

1994

#if defined(BROADCAST_BIAS)

1995

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1996

1997

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1998

1999

#ifndef UNIT_BETA

2000

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2001

#endif // UNIT_BETA

2002

2003

// c = c + bias[broadcasted]

2004

ADD_BLOCK_BROADCAST(M0, c, bias0);

2005

2006

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

2007

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

2008

2009

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2010

2011

#ifndef UNIT_BETA

2012

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

2017

2018

#endif // defined(BROADCAST_BIAS)

2019

#endif // defined(BETA)

2020

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2021

#if defined(ACTIVATION_TYPE)

2022

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2023

#endif // defined(ACTIVATION_TYPE)

2024

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2025

// Store output block

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

2026

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2027

2028

#undef RHS_BLOCK_SIZE

2029

#undef RHS_OFFSET_X

2030

#undef RHS_STEP_X

2031

}

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2032

2033

#if defined(OPENCL_IMAGE_SUPPORT)

2034

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2035

* The LHS matrix is NOT reshaped

2036

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

2037

*

2038

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2039

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2040

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

2041

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2042

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2043

* could be different from the value returned by get_image_height(rhs_img).

2044

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

2045

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

2046

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2047

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

2048

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

2049

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2050

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2051

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

2057

* The activation function is performed after the bias addition

2058

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2059

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

2060

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2061

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2062

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2063

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

2064

*

2065

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32

2066

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

2067

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

2068

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

2069

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

2070

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

2071

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2072

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2073

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2074

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2075

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2076

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2077

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2078

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

2079

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2080

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2081

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2082

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2083

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2084

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

2085

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2086

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2087

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2088

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

2089

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2090

*/

2091

__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),

2092

__read_only image2d_t rhs_img,

2093

#if defined(BETA)

2094

IMAGE_DECLARATION(bias),

2095

#endif // defined(BETA)

2096

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2102

uint dst_stride_z

2103

#if defined(REINTERPRET_INPUT_AS_3D)

2104

,

2105

uint lhs_cross_plane_pad

2106

#endif // REINTERPRET_INPUT_AS_3D

2107

#if defined(REINTERPRET_OUTPUT_AS_3D)

2108

,

2109

uint dst_cross_plane_pad

2110

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

// Block size
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

// RHS offset and step X
// RHS_STEP_LOOP is defined here too: the main accumulation loop of this kernel advances x_rhs by
// K0 * RHS_STEP_X * RHS_STEP_LOOP, so the kernel must not depend on the macro being left defined
// by gemm_mm_reshaped_only_rhs_nt. The values are token-identical to the ones used there, so the
// repeated definition is a legal (identical) macro redefinition.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

2127

2128

uint x = get_global_id(0);

2129

uint y = get_global_id(1);

2130

uint z = get_global_id(2);

2131

2132

#if defined(DUMMY_WORK_ITEMS)

2133

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2138

2139

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

2140

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2141

2142

#if defined(MATRIX_B_DEPTH)

2143

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2144

const uint z_rhs = (z % MATRIX_B_DEPTH);

2145

#else // defined(MATRIX_B_DEPTH)

2146

const uint z_rhs = z;

2147

#endif // defined(MATRIX_B_DEPTH)

2148

2149

// Compute RHS matrix coordinates

2150

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

2151

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

2152

2153

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);

2154

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2155

2156

#if defined(REINTERPRET_INPUT_AS_3D)

2157

2158

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2159

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

2160

2161

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2162

// multiply lhs_stride_z by DEPTH_GEMM3D

2163

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

2164

2165

#else // defined(REINTERPRET_INPUT_AS_3D)

2166

2167

// Add offset for batched GEMM

2168

lhs_offset += z * lhs_stride_z;

2169

2170

#endif // defined(REINTERPRET_INPUT_AS_3D)

2171

2172

// Initialize the accumulators

2173

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

2174

2175

int i = 0;

2176

for(; i <= (K - K0); i += K0)

2177

{

2178

// Load values from LHS matrix

2179

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

2180

2181

VEC_DATA_TYPE(DATA_TYPE, N0)

2182

b0;

2183

2184

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2185

VFMA_M0xN0(0, a, b0, c);

2186

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

2187

VFMA_M0xN0(1, a, b0, c);

2188

#if K0 > 2

2189

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

2190

VFMA_M0xN0(2, a, b0, c);

2191

#endif // K0 > 2

2192

#if K0 > 3

2193

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

2194

VFMA_M0xN0(3, a, b0, c);

2195

#endif // K0 > 3

2196

#if K0 > 4

2197

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

2198

VFMA_M0xN0(4, a, b0, c);

2199

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

2200

VFMA_M0xN0(5, a, b0, c);

2201

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

2202

VFMA_M0xN0(6, a, b0, c);

2203

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

2204

VFMA_M0xN0(7, a, b0, c);

2205

#endif // K0 > 4

2206

#if K0 > 8

2207

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

2208

VFMA_M0xN0(8, a, b0, c);

2209

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

2210

VFMA_M0xN0(9, a, b0, c);

2211

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

2212

VFMA_M0xN0(A, a, b0, c);

2213

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

2214

VFMA_M0xN0(B, a, b0, c);

2215

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

2216

VFMA_M0xN0(C, a, b0, c);

2217

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

2218

VFMA_M0xN0(D, a, b0, c);

2219

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

2220

VFMA_M0xN0(E, a, b0, c);

2221

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

2222

VFMA_M0xN0(F, a, b0, c);

2223

#endif // K0 > 8

2224

2225

lhs_offset += K0 * sizeof(DATA_TYPE);

2226

x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;

2227

}

2228

2229

// Left-over accumulations

2230

for(; i < K; ++i)

2231

{

2232

// Load values from LHS matrix

2233

VEC_DATA_TYPE(DATA_TYPE, 2)

2234

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

2235

#if M0 > 1

2236

VEC_DATA_TYPE(DATA_TYPE, 2)

2237

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

2238

#endif // M0 > 1

2239

#if M0 > 2

2240

VEC_DATA_TYPE(DATA_TYPE, 2)

2241

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

2242

#endif // M0 > 2

2243

#if M0 > 3

2244

VEC_DATA_TYPE(DATA_TYPE, 2)

2245

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

2246

#endif // M0 > 3

2247

#if M0 > 4

2248

VEC_DATA_TYPE(DATA_TYPE, 2)

2249

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

2250

#endif // M0 > 4

2251

#if M0 > 5

2252

VEC_DATA_TYPE(DATA_TYPE, 2)

2253

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

2254

#endif // M0 > 5

2255

#if M0 > 6

2256

VEC_DATA_TYPE(DATA_TYPE, 2)

2257

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

2258

#endif // M0 > 6

2259

#if M0 > 7

2260

VEC_DATA_TYPE(DATA_TYPE, 2)

2261

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

2262

#endif // M0 > 7

2263

2264

VEC_DATA_TYPE(DATA_TYPE, N0)

2265

b0;

2266

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2267

2268

VFMA_M0xN0(0, a, b0, c);

2269

2270

lhs_offset += sizeof(DATA_TYPE);

x_rhs += RHS_STEP_X;

}

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

2274

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2275

2276

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

2277

2278

#if defined(REINTERPRET_OUTPUT_AS_3D)

2279

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2280

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2281

2282

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2283

// multiply dst_stride_z by DEPTH_GEMM3D

2284

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2285

2286

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2287

2288

// Add offset for batched GEMM

2289

dst_addr += z * dst_stride_z;

2290

2291

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2292

2293

// Multiply by the weight of matrix-matrix product and store the result

2294

#if defined(ALPHA)

2295

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2296

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2301

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2302

2303

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2304

2305

#ifndef UNIT_BETA

2306

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2307

#endif // UNIT_BETA

2308

2309

// c = c + bias[broadcasted]

2310

ADD_BLOCK_BROADCAST(M0, c, bias0);

2311

2312

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

2313

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2314

2315

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2316

2317

#ifndef UNIT_BETA

2318

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

2323

2324

#endif // defined(BROADCAST_BIAS)

2325

#endif // defined(BETA)

2326

2327

#if defined(ACTIVATION_TYPE)

2328

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2329

#endif // defined(ACTIVATION_TYPE)

2330

2331

// Store output block

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

2332

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2333

2334

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2339

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2340

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2341

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2342

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2343

#if defined(MIXED_PRECISION)

2344

#if K0 == 2

2345

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

})

#elif K0 == 3 // K0 == 3

2351

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

})

#elif K0 == 4 // K0 == 4

2358

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

c += a.s3 * b.s3; \

})

#elif K0 == 8 // K0 == 8

2366

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

c += a.s3 * b.s3; \

c += a.s4 * b.s4; \

c += a.s5 * b.s5; \

c += a.s6 * b.s6; \

c += a.s7 * b.s7; \

})

#elif K0 == 16 // K0 == 16

2378

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

c += a.s3 * b.s3; \

c += a.s4 * b.s4; \

c += a.s5 * b.s5; \

c += a.s6 * b.s6; \

c += a.s7 * b.s7; \

c += a.s8 * b.s8; \

c += a.s9 * b.s9; \

c += a.sA * b.sA; \

c += a.sB * b.sB; \

c += a.sC * b.sC; \

c += a.sD * b.sD; \

c += a.sE * b.sE; \

c += a.sF * b.sF; \

})

#else // K0 not supported

2398

#error "K0 value not supported"

2399

#endif // K0 conditions

2400

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2401

#if K0 == 2

2402

#define ARM_DOT_K0(a, b, c) \

2403

({ \

2404

c = fma(a.s0, b.s0, c); \

2405

c = fma(a.s1, b.s1, c); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2406

})

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2407

#elif K0 == 3 // K0 == 3

2408

#define ARM_DOT_K0(a, b, c) \

2409

({ \

2410

c = fma(a.s0, b.s0, c); \

2411

c = fma(a.s1, b.s1, c); \

2412

c = fma(a.s2, b.s2, c); \

2413

})

2414

#elif K0 == 4 // K0 == 4

2415

#define ARM_DOT_K0(a, b, c) \

2416

({ \

2417

c = fma(a.s0, b.s0, c); \

2418

c = fma(a.s1, b.s1, c); \

2419

c = fma(a.s2, b.s2, c); \

2420

c = fma(a.s3, b.s3, c); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2421

})

2422

#elif K0 == 8 // K0 == 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2423

#define ARM_DOT_K0(a, b, c) \

2424

({ \

2425

c = fma(a.s0, b.s0, c); \

2426

c = fma(a.s1, b.s1, c); \

2427

c = fma(a.s2, b.s2, c); \

2428

c = fma(a.s3, b.s3, c); \

2429

c = fma(a.s4, b.s4, c); \

2430

c = fma(a.s5, b.s5, c); \

2431

c = fma(a.s6, b.s6, c); \

2432

c = fma(a.s7, b.s7, c); \

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2433

})

2434

#elif K0 == 16 // K0 == 16

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2435

/* Dot product over 16 lanes (.s0 .. .sF), accumulated into c through a chain
 * of fma() calls. c is evaluated several times and must be a modifiable
 * lvalue. */
#define ARM_DOT_K0(a, b, c)                 \
    ({                                      \
        (c) = fma((a).s0, (b).s0, (c));     \
        (c) = fma((a).s1, (b).s1, (c));     \
        (c) = fma((a).s2, (b).s2, (c));     \
        (c) = fma((a).s3, (b).s3, (c));     \
        (c) = fma((a).s4, (b).s4, (c));     \
        (c) = fma((a).s5, (b).s5, (c));     \
        (c) = fma((a).s6, (b).s6, (c));     \
        (c) = fma((a).s7, (b).s7, (c));     \
        (c) = fma((a).s8, (b).s8, (c));     \
        (c) = fma((a).s9, (b).s9, (c));     \
        (c) = fma((a).sA, (b).sA, (c));     \
        (c) = fma((a).sB, (b).sB, (c));     \
        (c) = fma((a).sC, (b).sC, (c));     \
        (c) = fma((a).sD, (b).sD, (c));     \
        (c) = fma((a).sE, (b).sE, (c));     \
        (c) = fma((a).sF, (b).sF, (c));     \
    })

2454

#else // K0 not supported

2455

#error "K0 value not supported"

2456

#endif // K0 conditions

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2457

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2458

2459

#if N0 == 2

2460

/* Applies ARM_DOT_K0 between a and each of the 2 variables b0, b1 (b is a
 * name prefix, so it must be passed as a bare identifier), accumulating the
 * n-th result into component .s<n> of c. */
#define ARM_DOT_K0XN0(a, b, c)                  \
    ({                                          \
        ARM_DOT_K0((a), (b##0), ((c).s0));      \
        ARM_DOT_K0((a), (b##1), ((c).s1));      \
    })

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2465

#elif N0 == 3 // N0 == 3

2466

/* Applies ARM_DOT_K0 between a and each of the 3 variables b0 .. b2 (b is a
 * name prefix, so it must be passed as a bare identifier), accumulating the
 * n-th result into component .s<n> of c. */
#define ARM_DOT_K0XN0(a, b, c)                  \
    ({                                          \
        ARM_DOT_K0((a), (b##0), ((c).s0));      \
        ARM_DOT_K0((a), (b##1), ((c).s1));      \
        ARM_DOT_K0((a), (b##2), ((c).s2));      \
    })

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2472

#elif N0 == 4 // N0 == 4

2473

/* Applies ARM_DOT_K0 between a and each of the 4 variables b0 .. b3 (b is a
 * name prefix, so it must be passed as a bare identifier), accumulating the
 * n-th result into component .s<n> of c. */
#define ARM_DOT_K0XN0(a, b, c)                  \
    ({                                          \
        ARM_DOT_K0((a), (b##0), ((c).s0));      \
        ARM_DOT_K0((a), (b##1), ((c).s1));      \
        ARM_DOT_K0((a), (b##2), ((c).s2));      \
        ARM_DOT_K0((a), (b##3), ((c).s3));      \
    })

2480

#elif N0 == 8 // N0 == 8

2481

/* Applies ARM_DOT_K0 between a and each of the 8 variables b0 .. b7 (b is a
 * name prefix, so it must be passed as a bare identifier), accumulating the
 * n-th result into component .s<n> of c. */
#define ARM_DOT_K0XN0(a, b, c)                  \
    ({                                          \
        ARM_DOT_K0((a), (b##0), ((c).s0));      \
        ARM_DOT_K0((a), (b##1), ((c).s1));      \
        ARM_DOT_K0((a), (b##2), ((c).s2));      \
        ARM_DOT_K0((a), (b##3), ((c).s3));      \
        ARM_DOT_K0((a), (b##4), ((c).s4));      \
        ARM_DOT_K0((a), (b##5), ((c).s5));      \
        ARM_DOT_K0((a), (b##6), ((c).s6));      \
        ARM_DOT_K0((a), (b##7), ((c).s7));      \
    })

2492

#elif N0 == 16 // N0 == 16

2493

/* Applies ARM_DOT_K0 between a and each of the 16 variables b0 .. b9, bA .. bF
 * (b is a name prefix, so it must be passed as a bare identifier),
 * accumulating the n-th result into component .s<n> of c. */
#define ARM_DOT_K0XN0(a, b, c)                  \
    ({                                          \
        ARM_DOT_K0((a), (b##0), ((c).s0));      \
        ARM_DOT_K0((a), (b##1), ((c).s1));      \
        ARM_DOT_K0((a), (b##2), ((c).s2));      \
        ARM_DOT_K0((a), (b##3), ((c).s3));      \
        ARM_DOT_K0((a), (b##4), ((c).s4));      \
        ARM_DOT_K0((a), (b##5), ((c).s5));      \
        ARM_DOT_K0((a), (b##6), ((c).s6));      \
        ARM_DOT_K0((a), (b##7), ((c).s7));      \
        ARM_DOT_K0((a), (b##8), ((c).s8));      \
        ARM_DOT_K0((a), (b##9), ((c).s9));      \
        ARM_DOT_K0((a), (b##A), ((c).sA));      \
        ARM_DOT_K0((a), (b##B), ((c).sB));      \
        ARM_DOT_K0((a), (b##C), ((c).sC));      \
        ARM_DOT_K0((a), (b##D), ((c).sD));      \
        ARM_DOT_K0((a), (b##E), ((c).sE));      \
        ARM_DOT_K0((a), (b##F), ((c).sF));      \
    })

2512

#else // N0 not supported

2513

#error "N0 value not supported"

2514

#endif // N0 conditions

2515

2516

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2517

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2518

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2519

*

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2520

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2521

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2522

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2523

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2524

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2525

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2526

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2527

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2528

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.

2529

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.

2530

* @note Only the following configurations of M0, N0 and K0 are currently supported:

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2531

* - M0 = 2, 3, 4, 5, 6, 7, 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2532

* - N0 = 2, 3, 4, 8, 16

2533

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

2534

* - V0 >= 1

2535

* - H0 >= 1

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2536

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2537

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2538

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2539

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2540

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2541

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2542

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2543

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2544

*

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2545

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

2546

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2547

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2548

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2549

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2550

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2551

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

2552

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

2553

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2554

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

2555

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2556

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

2557

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2558

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2559

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2560

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2561

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2562

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2563

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2564

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2565

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2566

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2567

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2568

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2569

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2570

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2571

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2572

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2573

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2574

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2575

*/

2576

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

2577

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2578

#if defined(BETA)

2579

IMAGE_DECLARATION(bias),

2580

#endif // defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2581

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2582

uint k,

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2583

uint lhs_stride_z,

2584

uint rhs_stride_z,

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2585

#if defined(BETA)

2586

uint bias_stride_z,

2587

#endif //defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2588

uint dst_stride_z

2589

#if defined(REINTERPRET_OUTPUT_AS_3D)

2590

,

2591

uint dst_cross_plane_pad

2592

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

2597

2598

#if defined(LHS_INTERLEAVE)

2599

#define LHS_OFFSET_X (K0)

2600

#define LHS_STEP_X ((K0) * (V0))

2601

#define LHS_STEP_LOOP (1)

2602

#else // defined(INTERLEAVE)

2603

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2604

#define LHS_STEP_X (K0)

2605

#define LHS_STEP_LOOP (V0)

2606

#endif // defined(INTERLEAVE)

2607

2608

// Block size

2609

#define RHS_BLOCK_SIZE ((K0) * (N0))

2610

2611

// RHS offset and step X

2612

#if defined(RHS_INTERLEAVE)

2613

#define RHS_OFFSET_X (K0)

2614

#define RHS_STEP_X ((K0) * (H0))

2615

#define RHS_STEP_LOOP (1)

2616

#else // defined(RHS_INTERLEAVE)

2617

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2618

#define RHS_STEP_X (K0)

2619

#define RHS_STEP_LOOP (H0)

2620

#endif // defined(RHS_INTERLEAVE)

2621

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2622

#if defined(DUMMY_WORK_ITEMS)

2623

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2628

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2629

// Compute LHS matrix address

2630

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2631

(get_global_id(2) * lhs_stride_z);

2632

2633

// Compute RHS matrix address

2634

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

2635

2636

#if defined(MATRIX_B_DEPTH)

2637

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2638

rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;

2639

#else // defined(MATRIX_B_DEPTH)

2640

rhs_addr += get_global_id(2) * rhs_stride_z;

2641

#endif // defined(MATRIX_B_DEPTH)

2642

2643

// Initialize the accumulators

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2644

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2645

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2646

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2647

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2648

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2649

for(int i = 0; i < k; i += K0)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2650

{

2651

// Supported cases (M0, K0):

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2652

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

2653

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

2654

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

2655

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

2656

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

2657

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

2658

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

2659

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2660

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2661

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2662

2663

// Load values from RHS matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2664

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2665

2666

// Accumulate

2667

ARM_DOT_K0XN0(a0, b, c0);

2668

#if M0 > 1

2669

ARM_DOT_K0XN0(a1, b, c1);

2670

#endif // M0 > 1

2671

#if M0 > 2

2672

ARM_DOT_K0XN0(a2, b, c2);

2673

#endif // M0 > 2

2674

#if M0 > 3

2675

ARM_DOT_K0XN0(a3, b, c3);

2676

#endif // M0 > 3

2677

#if M0 > 4

2678

ARM_DOT_K0XN0(a4, b, c4);

2679

#endif // M0 > 4

2680

#if M0 > 5

2681

ARM_DOT_K0XN0(a5, b, c5);

2682

#endif // M0 > 5

2683

#if M0 > 6

2684

ARM_DOT_K0XN0(a6, b, c6);

2685

#endif // M0 > 6

2686

#if M0 > 7

2687

ARM_DOT_K0XN0(a7, b, c7);

2688

#endif // M0 > 7

2689

2690

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2691

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

2692

}

2693

2694

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2695

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2696

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2697

2698

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2699

2700

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2701

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2702

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2703

// multiply dst_stride_z by DEPTH_GEMM3D

2704

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2705

2706

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2707

2708

// Add offset for batched GEMM

2709

dst_addr += get_global_id(2) * dst_stride_z;

2710

2711

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2712

2713

// Multiply by the weight of matrix-matrix product and store the result

2714

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2715

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2716

#endif // defined(ALPHA)

2717

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2718

// Add beta*bias

2719

#if defined(BETA)

2720

#if defined(BROADCAST_BIAS)

2721

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2722

2723

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2724

2725

#ifndef UNIT_BETA

2726

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2727

#endif // UNIT_BIAS

2728

2729

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2730

#if defined(MIXED_PRECISION)

2731

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2732

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2733

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2734

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2735

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2736

2737

#else // defined(BROADCAST_BIAS)

2738

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2739

2) * bias_stride_z;

2740

2741

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2742

2743

#ifndef UNIT_BETA

2744

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

2745

#endif // UNIT_BIAS

2746

2747

// c = c + bias

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2748

#if defined(MIXED_PRECISION)

2749

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2750

ADD_BLOCK(M0, c, bias_hp);

2751

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2752

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2753

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2754

2755

#endif // defined(BROADCAST_BIAS)

2756

#endif // defined(BETA)

2757

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2758

#if defined(ACTIVATION_TYPE)

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2759

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2760

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2761

#else // defined(MIXED_PRECISION)

2762

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2763

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2764

#endif // defined(ACTIVATION_TYPE)

2765

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2766

// Store output block

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2767

#if defined(MIXED_PRECISION)

2768

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

2769

#else // defined(MIXED_PRECISION)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2770

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2771

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2772

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2773

#undef LHS_BLOCK_SIZE

2774

#undef LHS_OFFSET_X

2775

#undef LHS_STEP_X

2776

#undef RHS_BLOCK_SIZE

2777

#undef RHS_OFFSET_X

2778

#undef RHS_STEP_X

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2779

#undef LHS_STEP_LOOP

2780

#undef RHS_STEP_LOOP

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2781

}

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2782

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2783

#if defined(OPENCL_IMAGE_SUPPORT)

2784

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

2785

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2786

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2787

*

2788

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2789

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2790

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2791

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

2792

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2793

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2794

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2795

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2796

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2797

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2798

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2799

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2800

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2801

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

2802

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2803

* - M0 = 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

2810

* The activation function is performed after the bias addition

2811

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

2812

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2813

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2814

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2815

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2816

*

2817

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

2818

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2819

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2820

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2821

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2822

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2823

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2824

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2825

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2826

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2827

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2828

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2829

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2830

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2831

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2832

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2833

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2834

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2835

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2836

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2837

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2838

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2839

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2840

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2841

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2842

*/

2843

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),

2844

__read_only image2d_t rhs_img,

2845

#if defined(BETA)

2846

IMAGE_DECLARATION(bias),

2847

#endif // defined(BETA)

2848

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2849

uint k,

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2855

uint dst_stride_z

2856

#if defined(REINTERPRET_OUTPUT_AS_3D)

2857

,

2858

uint dst_cross_plane_pad

2859

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

2864

2865

// Block size

2866

#define LHS_BLOCK_SIZE ((K0) * (M0))

2867

2868

#if defined(LHS_INTERLEAVE)

2869

#define LHS_OFFSET_X (K0)

2870

#define LHS_STEP_X ((K0) * (V0))

2871

#define LHS_STEP_LOOP (1)

2872

#else // defined(INTERLEAVE)

2873

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2874

#define LHS_STEP_X (K0)

2875

#define LHS_STEP_LOOP (V0)

2876

#endif // defined(INTERLEAVE)

2877

2878

// Block size

2879

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

2880

2881

// RHS offset and step X

2882

#if defined(RHS_INTERLEAVE)

2883

#define RHS_OFFSET_X (PIXEL_UNIT)

2884

#define RHS_STEP_X (PIXEL_UNIT * (H0))

2885

#define RHS_STEP_LOOP (1)

2886

#else // defined(RHS_INTERLEAVE)

2887

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2888

#define RHS_STEP_X PIXEL_UNIT

2889

#define RHS_STEP_LOOP (H0)

2890

#endif // defined(RHS_INTERLEAVE)

2891

2892

#if defined(DUMMY_WORK_ITEMS)

2893

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2898

2899

// Compute LHS matrix address

2900

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2901

(get_global_id(2) * lhs_stride_z);

2902

2903

#if defined(MATRIX_B_DEPTH)

2904

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2905

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

2906

#else // defined(MATRIX_B_DEPTH)

2907

const uint z_rhs = get_global_id(2);

2908

#endif // defined(MATRIX_B_DEPTH)

2909

2910

// Compute RHS matrix coordinates

2911

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

2912

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

2913

2914

// Initialize the accumulators

2915

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

2916

2917

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2918

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2919

2920

for(int i = 0; i < K; i += K0)

2921

{

2922

// Load values from LHS matrix

2923

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

2924

2925

// Load values from RHS matrix stored in a cl_image

2926

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

2927

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

2928

2929

// Accumulate

2930

ARM_DOT_K0XN0(a0, b, c0);

2931

#if M0 > 1

2932

ARM_DOT_K0XN0(a1, b, c1);

2933

#endif // M0 > 1

2934

#if M0 > 2

2935

ARM_DOT_K0XN0(a2, b, c2);

2936

#endif // M0 > 2

2937

#if M0 > 3

2938

ARM_DOT_K0XN0(a3, b, c3);

2939

#endif // M0 > 3

2940

#if M0 > 4

2941

ARM_DOT_K0XN0(a4, b, c4);

2942

#endif // M0 > 4

2943

#if M0 > 5

2944

ARM_DOT_K0XN0(a5, b, c5);

2945

#endif // M0 > 5

2946

#if M0 > 6

2947

ARM_DOT_K0XN0(a6, b, c6);

2948

#endif // M0 > 6

2949

#if M0 > 7

2950

ARM_DOT_K0XN0(a7, b, c7);

2951

#endif // M0 > 7

2952

2953

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2954

2955

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

2956

}

2957

2958

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2959

2960

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

2961

2962

#if defined(REINTERPRET_OUTPUT_AS_3D)

2963

2964

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2965

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2966

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2967

// multiply dst_stride_z by DEPTH_GEMM3D

2968

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2969

2970

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2971

2972

// Add offset for batched GEMM

2973

dst_addr += get_global_id(2) * dst_stride_z;

2974

2975

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2976

2977

// Multiply by the weight of matrix-matrix product and store the result

2978

#if defined(ALPHA)

2979

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2980

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2985

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2986

2987

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2988

2989

#ifndef UNIT_BETA

2990

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2991

#endif // UNIT_BIAS

2992

2993

// c = c + bias[broadcasted]

2994

#if defined(MIXED_PRECISION)

2995

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2996

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2997

#else // defined(MIXED_PRECISION)

2998

ADD_BLOCK_BROADCAST(M0, c, bias0);

2999

#endif // defined(MIXED_PRECISION)

3000

3001

#else // defined(BROADCAST_BIAS)

3002

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

3003

2) * bias_stride_z;

3004

3005

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3006

3007

#ifndef UNIT_BETA

3008

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

#if defined(MIXED_PRECISION)

3013

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3014

ADD_BLOCK(M0, c, bias_hp);

3015

#else // defined(MIXED_PRECISION)

3016

ADD_BLOCK(M0, c, bias);

3017

#endif // defined(MIXED_PRECISION)

3018

3019

#endif // defined(BROADCAST_BIAS)

3020

#endif // defined(BETA)

3021

3022

#if defined(ACTIVATION_TYPE)

3023

#if defined(MIXED_PRECISION)

3024

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

3025

#else // defined(MIXED_PRECISION)

3026

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

3027

#endif // defined(MIXED_PRECISION)

3028

#endif // defined(ACTIVATION_TYPE)

3029

3030

// Store output block

3031

#if defined(MIXED_PRECISION)

3032

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3033

#else // defined(MIXED_PRECISION)

3034

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3035

#endif // defined(MIXED_PRECISION)

3036

3037

#undef LHS_BLOCK_SIZE

3038

#undef LHS_OFFSET_X

3039

#undef LHS_STEP_X

3040

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3048

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3049

#if defined(LHS_TRANSPOSE)

3050

3051

// Shorthand for the vector type made of SIZE lanes of TYPE
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

// ARM_VFMA(N0, a, b, c): accumulate the element-wise product of a and b into c (c = a * b + c).
// - With MIXED_PRECISION, a and b are first converted to vectors of DATA_TYPE_ACCUMULATOR with N0 lanes.
// - When GPU_ARCH == GPU_ARCH_MIDGARD the product is written as "c += a * b", otherwise fma() is used.
// NOTE: every variant expands to a complete statement, trailing semicolon included.
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#else // defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#endif // defined(MIXED_PRECISION)

// Column-vector (transposed) by row-vector (not transposed) multiplication for a fixed number of rows.
// Row i of the output (C##i) accumulates a.s<i> (broadcast to N0 lanes) times b.
// The single-row variant takes a scalar a, so no component selector is applied.
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)           \
    ({                                                   \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0));   \
    })
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)              \
    ({                                                      \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0));   \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1));   \
    })
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)              \
    ({                                                      \
        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);             \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2));   \
    })
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)              \
    ({                                                      \
        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);             \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3));   \
    })
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)              \
    ({                                                      \
        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);             \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4));   \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5));   \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6));   \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7));   \
    })

// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
// a is the column-vector (transposed)
// b is the row-vector (not transposed)
// C is the output matrix
// Lower case is a vector (a, b)
// Upper case is a matrix (C)
// M0 is token-pasted into the macro name, so it must already be a literal when this macro is expanded
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)

// Matrix (transposed) by matrix (not transposed) multiplication for a fixed K0.
// A and B are name prefixes: row k of each operand is the variable <prefix><k>, with k a single hex digit (0..9, A..F).
// Each variant reuses the next smaller one and then accumulates the remaining rows.
#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)              \
    ({                                                          \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C);  \
    })
#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)              \
    ({                                                          \
        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);             \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C);  \
    })
#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)              \
    ({                                                          \
        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);             \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C);  \
    })
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)              \
    ({                                                          \
        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);             \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C);  \
    })
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)              \
    ({                                                          \
        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);             \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C);  \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C);  \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C);  \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C);  \
    })
// The parameters of the K0 = 16 variant are deliberately NOT called A, B, C: the row suffixes A, B and C
// would otherwise be replaced by the macro arguments (e.g. A##A would paste to "aa" instead of "aA").
// Rows 8..F go through the vector-by-vector macro, exactly like rows 4..7 of the K0 = 8 variant; routing
// an already parenthesised row through ARM_MM_T_NT_M0xN0x1 would attempt to paste ")" with "0".
#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, LHS, RHS, DST)               \
    ({                                                                  \
        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, LHS, RHS, DST);               \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##8), (RHS##8), DST);    \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##9), (RHS##9), DST);    \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##A), (RHS##A), DST);    \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##B), (RHS##B), DST);    \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##C), (RHS##C), DST);    \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##D), (RHS##D), DST);    \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##E), (RHS##E), DST);    \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##F), (RHS##F), DST);    \
    })

// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
// The dimensions for this matrix multiplications are defined through M0, N0 and K0
// The dimensions supported are:
// M0: 1, 2, 3, 4, 8
// N0: 1, 2, 3, 4, 8, 16
// K0: 1, 2, 3, 4, 8, 16
// This macro calls the vector-by-matrix macro K0 times
// A, B and C are matrices
#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
    (M0, N0, TYPE, A, B, C)

3158

3159

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

3160

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

3161

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

3162

*

3163

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

3164

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3165

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3166

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

3167

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

3168

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

3169

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

3170

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

3171

* @note Only the following configurations of M0, N0 and K0 are currently supported:

3172

* - M0 = 2, 3, 4, 8

3173

* - N0 = 2, 3, 4, 8, 16

3174

* - K0 = 2, 3, 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3179

* The activation function is performed after the bias addition

3180

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

3181

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3182

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3183

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3184

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

3185

*

3186

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

3187

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

3188

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

3189

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

3190

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

3191

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

3192

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

3193

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

3194

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem (in bytes)

3195

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

3196

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem (in bytes)

3197

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

3198

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

3199

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3200

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

3201

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3202

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

3203

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

3204

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

3205

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3206

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3207

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3208

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3209

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3210

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3211

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

3212

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

3213

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

3214

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3215

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3216

*/

3217

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
                                            IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                            IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                            IMAGE_DECLARATION(dst),
                                            uint k,
                                            uint lhs_stride_z,
                                            uint rhs_stride_z,
#if defined(BETA)
                                            uint bias_stride_z,
#endif // defined(BETA)
                                            uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                            ,
                                            uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                           )
{
    // LHS block size (in elements)
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
    // NOTE: LHS_STEP_LOOP is not referenced in this kernel body; it is defined and released together with the
    // other LHS helpers so that nothing leaks past the end of the kernel.
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (M0)
#define LHS_STEP_X ((M0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (M0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // RHS block size (in elements)
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (N0)
#define RHS_STEP_X ((N0) * (H0))
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (N0)
#endif // defined(RHS_INTERLEAVE)

    // One step along K: load M0 LHS values and N0 RHS values, accumulate their column-by-row product
    // into the c accumulators and advance both pointers to the next row of the current block.
#define LHS_T_RHS_NT_ACCUMULATE_STEP()              \
    ({                                              \
        a0 = VLOAD(M0)(0, lhs);                     \
        b0 = VLOAD(N0)(0, rhs);                     \
        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); \
        lhs += LHS_STEP_X;                          \
        rhs += RHS_STEP_X;                          \
    })

    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    const uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the N x M output have nothing to compute
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

    // Compute RHS matrix address
    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_addr += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    // Zero offsets handed to LOAD_BLOCK when reading the bias
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

    // From here on the matrices are addressed in elements rather than in bytes
    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);

    // k is unsigned, so the induction variable is unsigned as well (no mixed-sign comparison)
    for(uint i = 0; i < k; i += K0)
    {
        VEC_DATA_TYPE(DATA_TYPE, M0)
        a0;
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        // The K0 steps of a block are unrolled at compile time: 1, 2, 3, 4, 8 or 16 steps
        LHS_T_RHS_NT_ACCUMULATE_STEP();

#if K0 > 1
        LHS_T_RHS_NT_ACCUMULATE_STEP();
#endif // K0 > 1

#if K0 > 2
        LHS_T_RHS_NT_ACCUMULATE_STEP();
#endif // K0 > 2

#if K0 > 3
        LHS_T_RHS_NT_ACCUMULATE_STEP();
#endif // K0 > 3

#if K0 > 4
        LHS_T_RHS_NT_ACCUMULATE_STEP();
        LHS_T_RHS_NT_ACCUMULATE_STEP();
        LHS_T_RHS_NT_ACCUMULATE_STEP();
        LHS_T_RHS_NT_ACCUMULATE_STEP();
#endif // K0 > 4

#if K0 > 8
        LHS_T_RHS_NT_ACCUMULATE_STEP();
        LHS_T_RHS_NT_ACCUMULATE_STEP();
        LHS_T_RHS_NT_ACCUMULATE_STEP();
        LHS_T_RHS_NT_ACCUMULATE_STEP();
        LHS_T_RHS_NT_ACCUMULATE_STEP();
        LHS_T_RHS_NT_ACCUMULATE_STEP();
        LHS_T_RHS_NT_ACCUMULATE_STEP();
        LHS_T_RHS_NT_ACCUMULATE_STEP();
#endif // K0 > 8

        // Without interleaving, skip the remaining (V0 - 1) LHS blocks stored on the same row
#ifndef LHS_INTERLEAVE
        lhs += (M0 * K0 * (V0 - 1));
#endif // LHS_INTERLEAVE

        // Without interleaving, skip the remaining (H0 - 1) RHS blocks stored on the same row
#ifndef RHS_INTERLEAVE
        rhs += (N0 * K0 * (H0 - 1));
#endif // RHS_INTERLEAVE
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // The activation is applied after the bias addition
#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#endif // defined(MIXED_PRECISION)

    // Release every kernel-local helper macro
#undef LHS_T_RHS_NT_ACCUMULATE_STEP
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef LHS_STEP_LOOP
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3532

#if defined(OPENCL_IMAGE_SUPPORT)

3533

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

3534

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

3535

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

3536

*

3537

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

3538

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3539

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

3540

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

3541

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

3542

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

3543

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3544

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

3545

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

3546

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

3547

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

3548

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

3549

* @note Only the following configurations of M0, N0 and K0 are currently supported:

* - M0 = 2, 3, 4, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3557

* The activation function is performed after the bias addition

3558

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

3559

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3560

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3561

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3562

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

3563

*

3564

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

3565

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

3566

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

3567

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

3568

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

3569

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

3570

* @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr

3571

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

3572

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3573

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

3574

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3575

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

3576

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

3577

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

3578

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3579

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3580

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3581

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3582

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3583

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3584

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

3585

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

3586

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

3587

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3588

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3589

*/

3590

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),

3591

__read_only image2d_t rhs_img,

3592

#if defined(BETA)

3593

IMAGE_DECLARATION(bias),

3594

#endif // defined(BETA)

3595

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3596

uint k,

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

3602

uint dst_stride_z

3603

#if defined(REINTERPRET_OUTPUT_AS_3D)

3604

,

3605

uint dst_cross_plane_pad

3606

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

3611

3612

// Block size

3613

#define LHS_BLOCK_SIZE ((K0) * (M0))

3614

3615

#if defined(LHS_INTERLEAVE)

3616

#define LHS_OFFSET_X (M0)

3617

#define LHS_STEP_X ((M0) * (V0))

3618

#define LHS_STEP_LOOP (1)

3619

#else // defined(LHS_INTERLEAVE)

3620

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

3621

#define LHS_STEP_X (M0)

3622

#define LHS_STEP_LOOP (V0)

3623

#endif // defined(LHS_INTERLEAVE)

3624

3625

// Block size

3626

#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

3627

3628

// RHS offset and step X

3629

#if defined(RHS_INTERLEAVE)

3630

#define RHS_OFFSET_X (PIXEL_UNIT)

3631

#define RHS_STEP_X ((PIXEL_UNIT) * (H0))

3632

#else // defined(RHS_INTERLEAVE)

3633

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

3634

#define RHS_STEP_X (PIXEL_UNIT)

3635

#endif // defined(RHS_INTERLEAVE)

3636

3637

const uint x = get_global_id(0);

3638

const uint y = get_global_id(1);

3639

const uint z = get_global_id(2);

3640

3641

#if defined(DUMMY_WORK_ITEMS)

3642

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

3647

3648

// Compute LHS matrix address

3649

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

3650

3651

#if defined(MATRIX_B_DEPTH)

3652

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3653

const uint z_rhs = (z % MATRIX_B_DEPTH);

3654

#else // defined(MATRIX_B_DEPTH)

3655

const uint z_rhs = z;

3656

#endif // defined(MATRIX_B_DEPTH)

3657

3658

// Compute RHS matrix coordinates

3659

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

3660

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

3661

3662

// Initialize the accumulators

3663

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

3664

3665

REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

3666

3667

__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

3668

3669

for(int i = 0; i < K; i += K0)

3670

{

3671

VEC_DATA_TYPE(DATA_TYPE, M0)

3672

a0;

3673

VEC_DATA_TYPE(DATA_TYPE, N0)

3674

b0;

3675

3676

a0 = VLOAD(M0)(0, lhs);

3677

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

3678

3679

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#if K0 > 1

a0 = VLOAD(M0)(0, lhs);

3685

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

3686

3687

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 1

#if K0 > 2

a0 = VLOAD(M0)(0, lhs);

3694

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

3695

3696

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 2

#if K0 > 3

a0 = VLOAD(M0)(0, lhs);

3703

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

3704

3705

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 3

#if K0 > 4

a0 = VLOAD(M0)(0, lhs);

3712

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

3713

3714

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3719

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

3720

3721

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3726

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

3727

3728

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3733

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

3734

3735

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 4

#if K0 > 8

a0 = VLOAD(M0)(0, lhs);

3742

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

3743

3744

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3749

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

3750

3751

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3756

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

3757

3758

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3763

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

3764

3765

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3770

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

3771

3772

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3777

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

3778

3779

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3784

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

3785

3786

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3791

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

3792

3793

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 8

#ifndef LHS_INTERLEAVE

3799

lhs += (M0 * K0 * (V0 - 1));

3800

#endif // LHS_INTERLEAVE

3801

3802

x_rhs += K0 * RHS_STEP_X;

3803

#ifndef RHS_INTERLEAVE

3804

x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));

3805

#endif // RHS_INTERLEAVE

3806

}

3807

3808

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

3809

3810

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

3811

3812

#if defined(REINTERPRET_OUTPUT_AS_3D)

3813

3814

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

3815

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

3816

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3817

// multiply dst_stride_z by DEPTH_GEMM3D

3818

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

3819

3820

#else // defined(REINTERPRET_OUTPUT_AS_3D)

3821

3822

// Add offset for batched GEMM

3823

dst_addr += z * dst_stride_z;

3824

3825

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3826

3827

// Multiply by the weight of matrix-matrix product and store the result

3828

#if defined(ALPHA)

3829

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

3830

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

3835

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

3836

3837

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3838

3839

#ifndef UNIT_BETA

3840

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

3841

#endif // UNIT_BETA

3842

3843

// c = c + bias[broadcasted]

3844

#if defined(MIXED_PRECISION)

3845

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3846

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

3847

#else // defined(MIXED_PRECISION)

3848

ADD_BLOCK_BROADCAST(M0, c, bias0);

3849

#endif // defined(MIXED_PRECISION)

3850

3851

#else // defined(BROADCAST_BIAS)

3852

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

3853

3854

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3855

3856

#ifndef UNIT_BETA

3857

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

3858

#endif // UNIT_BETA

3859

3860

#if defined(MIXED_PRECISION)

3861

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3862

ADD_BLOCK(M0, c, bias_hp);

3863

#else // defined(MIXED_PRECISION)

3864

ADD_BLOCK(M0, c, bias);

3865

#endif // defined(MIXED_PRECISION)

3866

3867

#endif // defined(BROADCAST_BIAS)

3868

#endif // defined(BETA)

3869

3870

#if defined(ACTIVATION_TYPE)

3871

#if defined(MIXED_PRECISION)

3872

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

3873

#else // defined(MIXED_PRECISION)

3874

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

3875

#endif // defined(MIXED_PRECISION)

3876

#endif // defined(ACTIVATION_TYPE)

3877

3878

// Store output block

3879

#if defined(MIXED_PRECISION)

3880

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3881

#else // defined(MIXED_PRECISION)

3882

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3883

#endif // defined(MIXED_PRECISION)

3884

3885

#undef LHS_BLOCK_SIZE

3886

#undef LHS_OFFSET_X

3887

#undef LHS_STEP_X

3888

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3896

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3897

#endif // defined(LHS_TRANSPOSE)

3898

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

3899

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)

3900

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3901

#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

3902

3903

#define VFMA(a, b, c) \

({ \

c = fma(a, b, c); \

})

#if M0 == 1

#define RHS_VFMA_M0xN0(i, a, b, c) \

3910

({ \

3911

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3912

})

3913

#elif M0 == 2 // M0 == 2

3914

#define RHS_VFMA_M0xN0(i, a, b, c) \

3915

({ \

3916

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3917

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3918

})

3919

#elif M0 == 3 // M0 == 3

3920

#define RHS_VFMA_M0xN0(i, a, b, c) \

3921

({ \

3922

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3923

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3924

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3925

})

3926

#elif M0 == 4 // M0 == 4

3927

#define RHS_VFMA_M0xN0(i, a, b, c) \

3928

({ \

3929

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3930

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3931

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3932

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

3933

})

3934

#elif M0 == 5 // M0 == 5

3935

#define RHS_VFMA_M0xN0(i, a, b, c) \

3936

({ \

3937

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3938

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3939

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3940

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

3941

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

3942

})

3943

#elif M0 == 6 // M0 == 6

3944

#define RHS_VFMA_M0xN0(i, a, b, c) \

3945

({ \

3946

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3947

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3948

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3949

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

3950

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

3951

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \

3952

})

3953

#elif M0 == 7 // M0 == 7

3954

#define RHS_VFMA_M0xN0(i, a, b, c) \

3955

({ \

3956

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3957

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3958

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3959

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

3960

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

3961

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \

3962

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \

3963

})

3964

#elif M0 == 8 // M0 == 8

3965

#define RHS_VFMA_M0xN0(i, a, b, c) \

3966

({ \

3967

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3968

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3969

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3970

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

3971

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

3972

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \

3973

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \

3974

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \

3975

})

3976

#else // M0 not supported

3977

#error "M0 not supported"

3978

#endif // M0 not supported

3979

3980

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

3981

* The LHS matrix is NOT reshaped

3982

* The RHS matrix is NOT reshaped

3983

*

3984

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3985

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

3986

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

3987

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

3988

* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)

3989

* @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

3990

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

3991

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3992

* @note Only the following configurations of M0, N0 and K0 are currently supported:

3993

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

3994

* - N0 = 2, 3, 4, 8, 16

3995

* - K0 = 2, 3, 4, 8, 16

3996

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3997

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

3998

* The activation function is performed after the bias addition

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3999

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

4000

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

4001

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4002

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4003

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4004

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

4005

*

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4006

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

4007

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

4008

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

4009

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

4010

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

4011

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

4012

* @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr

4013

* @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)

4014

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

4015

* @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)

4016

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

4017

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4018

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

4019

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4020

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

4021

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4022

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

4023

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

4024

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

4025

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4026

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4027

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4028

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4029

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

4030

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

4031

* @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)

4032

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

4033

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

4034

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

4035

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4036

*/

4037

__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),

4038

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4039

#if defined(BETA)

4040

IMAGE_DECLARATION(bias),

4041

#endif // defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4042

IMAGE_DECLARATION(dst),

4043

uint lhs_stride_z,

4044

uint rhs_stride_z,

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4045

#if defined(BETA)

4046

uint bias_stride_z,

4047

#endif //defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4048

uint dst_stride_z

4049

#if defined(REINTERPRET_INPUT_AS_3D)

4050

,

4051

uint lhs_cross_plane_pad

4052

#endif // REINTERPRET_INPUT_AS_3D

4053

#if defined(REINTERPRET_OUTPUT_AS_3D)

4054

,

4055

uint dst_cross_plane_pad

4056

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

4061

4062

// RHS offset and step X

4063

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

4064

4065

uint x = get_global_id(0);

4066

uint y = get_global_id(1);

4067

uint z = get_global_id(2);

4068

4069

#if defined(DUMMY_WORK_ITEMS)

4070

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

4075

4076

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

4077

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4078

4079

// Compute RHS matrix address

4080

uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

4081

4082

#if defined(MATRIX_B_DEPTH)

4083

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4084

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

4085

#else // defined(MATRIX_B_DEPTH)

4086

rhs_offset += z * rhs_stride_z;

4087

#endif // defined(MATRIX_B_DEPTH)

4088

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4089

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

4090

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4091

4092

#if defined(REINTERPRET_INPUT_AS_3D)

4093

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

4094

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

4095

4096

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4097

// multiply lhs_stride_z by DEPTH_GEMM3D

4098

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

4099

4100

#else // defined(REINTERPRET_INPUT_AS_3D)

4101

4102

// Add offset for batched GEMM

4103

lhs_offset += z * lhs_stride_z;

4104

4105

#endif // defined(REINTERPRET_INPUT_AS_3D)

4106

4107

// Initialize the accumulators

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4108

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4109

4110

int i = 0;

4111

for(; i <= (K - K0); i += K0)

4112

{

4113

// Supported cases (M0, K0):

4114

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

4115

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

4116

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

4117

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

4118

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

4119

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

4120

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

4121

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

4122

// Load values from LHS matrix

4123

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

4124

4125

// Load values from RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4126

LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4127

4128

RHS_VFMA_M0xN0(0, a, b0, c);

4129

RHS_VFMA_M0xN0(1, a, b1, c);

4130

#if K0 > 2

4131

RHS_VFMA_M0xN0(2, a, b2, c);

4132

#endif // K0 > 2

4133

#if K0 > 3

4134

RHS_VFMA_M0xN0(3, a, b3, c);

4135

#endif // K0 > 3

4136

#if K0 > 4

4137

RHS_VFMA_M0xN0(4, a, b4, c);

4138

RHS_VFMA_M0xN0(5, a, b5, c);

4139

RHS_VFMA_M0xN0(6, a, b6, c);

4140

RHS_VFMA_M0xN0(7, a, b7, c);

4141

#endif // K0 > 4

4142

#if K0 > 8

4143

RHS_VFMA_M0xN0(8, a, b8, c);

4144

RHS_VFMA_M0xN0(9, a, b9, c);

Gian Marco Iodice

7b9d7ca

2019-09-19 16:37:39 +0100

[diff] [blame]

4145

RHS_VFMA_M0xN0(A, a, bA, c);

4146

RHS_VFMA_M0xN0(B, a, bB, c);

4147

RHS_VFMA_M0xN0(C, a, bC, c);

4148

RHS_VFMA_M0xN0(D, a, bD, c);

4149

RHS_VFMA_M0xN0(E, a, bE, c);

4150

RHS_VFMA_M0xN0(F, a, bF, c);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4151

#endif // K0 > 8

4152

4153

lhs_offset += K0 * sizeof(DATA_TYPE);

4154

rhs_offset += K0 * rhs_stride_y;

4155

}

4156

4157

// Left-over accumulations

4158

for(; i < K; ++i)

4159

{

4160

// Load values from LHS matrix

4161

VEC_DATA_TYPE(DATA_TYPE, 2)

4162

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));

4163

#if M0 > 1

4164

VEC_DATA_TYPE(DATA_TYPE, 2)

4165

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));

4166

#endif // M0 > 1

4167

#if M0 > 2

4168

VEC_DATA_TYPE(DATA_TYPE, 2)

4169

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));

4170

#endif // M0 > 2

4171

#if M0 > 3

4172

VEC_DATA_TYPE(DATA_TYPE, 2)

4173

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));

4174

#endif // M0 > 3

4175

#if M0 > 4

4176

VEC_DATA_TYPE(DATA_TYPE, 2)

4177

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));

4178

#endif // M0 > 4

4179

#if M0 > 5

4180

VEC_DATA_TYPE(DATA_TYPE, 2)

4181

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));

4182

#endif // M0 > 5

4183

#if M0 > 6

4184

VEC_DATA_TYPE(DATA_TYPE, 2)

4185

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));

4186

#endif // M0 > 6

4187

#if M0 > 7

4188

VEC_DATA_TYPE(DATA_TYPE, 2)

4189

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));

4190

#endif // M0 > 7

4191

4192

VEC_DATA_TYPE(DATA_TYPE, N0)

4193

b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));

4194

RHS_VFMA_M0xN0(0, a, b, c);

4195

4196

lhs_offset += sizeof(DATA_TYPE);

4197

rhs_offset += rhs_stride_y;

4198

}

4199

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

4200

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4201

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4202

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4203

4204

#if defined(REINTERPRET_OUTPUT_AS_3D)

4205

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

4206

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

4207

4208

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4209

// multiply dst_stride_z by DEPTH_GEMM3D

4210

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

4211

4212

#else // defined(REINTERPRET_OUTPUT_AS_3D)

4213

4214

// Add offset for batched GEMM

4215

dst_addr += z * dst_stride_z;

4216

4217

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4218

4219

// Multiply by the weight of matrix-matrix product and store the result

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4220

#if defined(ALPHA)

4221

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

4222

#endif // defined(ALPHA)

4223

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4224

// Add beta*bias

4225

#if defined(BETA)

4226

#if defined(BROADCAST_BIAS)

4227

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

4228

4229

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

4230

4231

#ifndef UNIT_BETA

4232

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

4233

#endif // UNIT_BETA

4234

4235

// c = c + bias[broadcasted]

4236

ADD_BLOCK_BROADCAST(M0, c, bias0);

4237

4238

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

4239

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4240

4241

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

4242

4243

#ifndef UNIT_BETA

4244

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

4249

4250

#endif // defined(BROADCAST_BIAS)

4251

#endif // defined(BETA)

4252

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

4253

#if defined(ACTIVATION_TYPE)

4254

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

4255

#endif // defined(ACTIVATION_TYPE)

4256

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4257

// Store output block

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame^]

4258

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4259

4260

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

4265

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4266

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4267

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4268

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4269

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4270

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4271

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4272

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4273

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4274

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4275

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

4276

* The activation function is performed after the bias addition

4277

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4278

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4279

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4280

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4281

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4282

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4283

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

4284

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4285

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4286

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4287

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4288

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4289

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4290

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4291

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4292

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4293

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4294

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4295

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4296

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4297

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4298

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4299

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4300

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4301

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4302

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4303

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4304

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4305

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4306

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4307

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4308

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4309

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4310

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4311

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4312

*/

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4313

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),

4314

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4315

// The bias tensor arguments (src2) are part of the signature only when BETA is defined
#if defined(BETA)

4316

IMAGE_DECLARATION(src2),

4317

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4318

IMAGE_DECLARATION(dst),

4319

uint src0_stride_z,

4320

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4321

#if defined(BETA)

4322

uint src2_stride_z,

4323

#endif // defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4324

uint dst_stride_z

4325

// cross_plane_pad is part of the signature only when REINTERPRET_OUTPUT_AS_3D is defined
#if defined(REINTERPRET_OUTPUT_AS_3D)

4326

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4327

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4328

#endif // REINTERPRET_OUTPUT_AS_3D

4329

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4330

{
// Summary of the body: each work-item accumulates a 4x4 block of floats (rows c0..c3) from float4 loads
// of src0 and src1, then optionally scales it by ALPHA, adds BETA * bias (src2), applies the activation
// and finally stores four float4 rows to dst.

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4331

// x and y are the global ids divided by the reshape multipliers; below they are used as row indices
// (multiples of src1_stride_y and src0_stride_y respectively). z is used along the Z strides.
int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4332

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4333

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4334

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4335

// Offsets, in floats, added to the row start addresses further down: the remainder of the global id
// selects which group of 4 floats this work-item starts from
// Offset

4336

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4337

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4338

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4339

// src_addr_a = address of matrix A

4340

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4341

// Byte offsets of the starting rows inside src0 and src1
int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4342

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4343

4344

#if defined(MATRIX_B_DEPTH)

4345

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4346

// z wraps around MATRIX_B_DEPTH, so the Z offset into src1 never exceeds (MATRIX_B_DEPTH - 1) * src1_stride_z
src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4347

#else // defined(MATRIX_B_DEPTH)

4348

src1_addr_in_bytes += z * src1_stride_z;

4349

#endif // defined(MATRIX_B_DEPTH)

4350

4351

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

4352

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4353

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4354

// Compute end row address for matrix B
// COLS_B is counted in floats here (pointer arithmetic on a float pointer) and the end address is taken
// before offset_row_b is applied to src_addr_b

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4355

__global float *src_end_addr_b = src_addr_b + COLS_B;

4356

4357

src_addr_a += offset_row_a;

4358

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4359

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4360

// Reset accumulators
// c0..c3 are the four float4 rows of the 4x4 output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4365

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4366

// Main loop, two accumulation steps per iteration: it runs while at least 8 * MULT_TRANSPOSE1XW_WIDTH floats
// remain before src_end_addr_b
for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4367

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4368

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4369

float4 a0 = vload4(0, src_addr_a);

4370

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4371

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4372

// Row i of the block accumulates component i of a0 (broadcast to a float4) multiplied by b0
c0 += (float4)a0.s0 * b0;

4373

c1 += (float4)a0.s1 * b0;

4374

c2 += (float4)a0.s2 * b0;

4375

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4376

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4377

// Load values from matrix A (interleaved) and matrix B (transposed)
// Second step of the iteration: same computation on the data located 4 * MULT_* floats further on

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4378

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

4379

b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4380

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4381

c0 += (float4)a0.s0 * b0;

4382

c1 += (float4)a0.s1 * b0;

4383

c2 += (float4)a0.s2 * b0;

4384

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4385

}

4386

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4387

// Tail loop: one accumulation step per iteration until src_end_addr_b is reached
for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4388

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4389

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4390

float4 a0 = vload4(0, src_addr_a);

4391

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4392

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4393

c0 += (float4)a0.s0 * b0;

4394

c1 += (float4)a0.s1 * b0;

4395

c2 += (float4)a0.s2 * b0;

4396

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4397

}

4398

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4399

// Compute destination address
// NOTE(review): CONVERT_TO_IMAGE_STRUCT and offset() are defined outside this view; the x/y positioning
// of dst_addr is presumably derived from the global ids there - confirm against the helper definitions

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4400

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4401

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4402

// Compute dst address

4403

__global uchar *dst_addr = offset(&dst, 0, 0);

4404

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4405

// zout holds one extra byte offset per output row; it stays 0 unless REINTERPRET_OUTPUT_AS_3D is defined
uint4 zout = 0;

4406

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4407

#if defined(REINTERPRET_OUTPUT_AS_3D)

4408

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4409

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4410

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4415

// |******************|

4416

// | cross_plane_pad |

4417

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4422

4423

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
// (0, 1, 2, 3) are the row indices of the four rows handled by this work-item

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4424

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4425

// Clamp the plane index to the last plane (DEPTH_GEMM3D - 1)
zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4426

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4427

// Add offset due to the cross plane paddings
// (the plane index is turned into bytes: cross_plane_pad rows of dst_stride_y bytes per plane index)

4428

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4429

4430

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4431

// multiply dst_stride_z by DEPTH_GEMM3D

4432

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4433

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4434

// Add offset for batched GEMM

4435

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4436

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4437

4438

// Multiply by the weight of the matrix-matrix product (ALPHA); the store happens at the end of the kernel
// NOTE(review): SCALE_BLOCK is a macro defined outside this view; presumably it scales c0..c3 in place - confirm

4439

#if defined(ALPHA)

4440

SCALE_BLOCK(4, float, c, ALPHA);

4441

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

// NOTE(review): REPEAT_VAR_INIT_TO_CONST is defined outside this view; presumably it declares the zero
// offsets consumed by the LOAD_BLOCK calls below - confirm against the macro definition
REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4446

4447

#if defined(BROADCAST_BIAS)

4448

// Broadcast case: the bias address depends only on get_global_id(0) and a single row of 4 floats is loaded
__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4449

4450

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4451

4452

// The scaling by BETA is compiled out when UNIT_BETA is defined
#ifndef UNIT_BETA

4453

SCALE_BLOCK(1, float, bias, BETA);

4454

#endif // UNIT_BETA

4455

4456

// c = c + bias[broadcasted]

4457

ADD_BLOCK_BROADCAST(4, c, bias0);

4458

4459

#else // defined(BROADCAST_BIAS)

4460

// Full case: a 4x4 bias block is addressed from the raw global ids (not the divided x / y values)
__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

4461

2) * src2_stride_z;

4462

4463

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4464

4465

// The scaling by BETA is compiled out when UNIT_BETA is defined
#ifndef UNIT_BETA

4466

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

4471

4472

#endif // defined(BROADCAST_BIAS)

4473

#endif // defined(BETA)

4474

4475

// The activation is applied after the optional ALPHA scaling and bias addition above
#if defined(ACTIVATION_TYPE)

4476

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

4477

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4478

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4479

// Store 4x4 block
// Row i is written at dst_addr + i * dst_stride_y plus its own zout offset

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4480

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

4481

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

4482

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

4483

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4484

}

4485

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4486

/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4487

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4488

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4489

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4490

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4491

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4492

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4493

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4494

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4495

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

4496

* The activation function is performed after the bias addition

4497

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4498

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4499

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4500

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4501

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4502

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4503

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

4504

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4505

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4506

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4507

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4508

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4509

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4510

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4511

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4512

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4513

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4514

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4515

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4516

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4517

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4518

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4519

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4520

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4521

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4522

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4523

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4524

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4525

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4526

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4527

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4528

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4529

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4530

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4531

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4532

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4533

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

4534

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4535

#if defined(BETA)

4536

IMAGE_DECLARATION(src2),

4537

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4538

IMAGE_DECLARATION(dst),

4539

uint src0_stride_z,

4540

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4541

#if defined(BETA)

4542

uint src2_stride_z,

4543

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4544

uint dst_stride_z

4545

#if defined(REINTERPRET_OUTPUT_AS_3D)

4546

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4547

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4548

#endif // REINTERPRET_OUTPUT_AS_3D

4549

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4550

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4551

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4552

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4553

int z = get_global_id(2);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4554

4555

// Offset

4556

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4557

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

4558

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4559

// src_addr_a = address of matrix A

4560

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4561

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4562

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4563

4564

#if defined(MATRIX_B_DEPTH)

4565

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4566

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4567

#else // defined(MATRIX_B_DEPTH)

4568

src1_addr_in_bytes += z * src1_stride_z;

4569

#endif // defined(MATRIX_B_DEPTH)

4570

4571

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

4572

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4573

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4574

src_addr_a += offset_row_a;

4575

src_addr_b += offset_row_b;

4576

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4577

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4582

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4583

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

4584

4585

int i = 0;

4586

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4587

{

4588

// Load values from matrix A (interleaved) and matrix B (transposed)

4589

float4 a0 = vload4(0, src_addr_a);

4590

float4 b0 = vload4(0, src_addr_b);

4591

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4592

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4593

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4594

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4595

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4596

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4597

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4598

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4599

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4600

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4601

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4602

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4603

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4604

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4605

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4606

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4607

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4608

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4609

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4610

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4611

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4612

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4613

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4614

4615

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4616

a0 = vload4(0, src_addr_a);

4617

b0 = vload4(0, src_addr_b);

4618

4619

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4620

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4621

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4622

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4623

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4624

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4625

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4626

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4627

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4628

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4629

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4630

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4631

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4632

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4633

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4634

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4635

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4636

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4637

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4638

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4639

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4640

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4641

4642

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4643

a0 = vload4(0, src_addr_a);

4644

b0 = vload4(0, src_addr_b);

4645

4646

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4647

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

4648

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4649

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4650

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4651

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4652

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4653

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4654

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4655

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4656

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4657

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4658

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4659

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4660

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4661

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4662

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4663

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4664

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4665

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4666

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4667

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4668

4669

// Load values from matrix A (interleaved) and matrix B (transposed)

4670

a0 = vload4(0, src_addr_a);

4671

b0 = vload4(0, src_addr_b);

4672

4673

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4674

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4675

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4676

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4677

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4678

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4679

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4680

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4681

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4682

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4683

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4684

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4685

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4686

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4687

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4688

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4689

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4690

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4691

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4692

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4693

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4694

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4695

}

4696

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4697

for(; i < (int)(COLS_MTX_B); ++i)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4698

{

4699

// Load values from matrix A (interleaved) and matrix B (transposed)

4700

float4 a0 = vload4(0, src_addr_a);

4701

float4 b0 = vload4(0, src_addr_b);

4702

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4703

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4704

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

4705

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4706

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4707

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4708

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4709

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4710

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4711

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4712

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4713

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4714

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4715

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4716

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4717

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4718

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4719

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4720

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4721

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4722

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4723

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4724

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4725

}

4726

4727

// Compute destination address

4728

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4729

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4730

// Compute dst address

4731

__global uchar *dst_addr = offset(&dst, 0, 0);

4732

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4733

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4734

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4735

#if defined(REINTERPRET_OUTPUT_AS_3D)

4736

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4737

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4738

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4743

// |******************|

4744

// | cross_plane_pad |

4745

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4750

4751

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4752

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4753

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4754

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4755

// Add offset due to the cross plane paddings

4756

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4757

4758

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4759

// multiply dst_stride_z by DEPTH_GEMM3D

4760

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4761

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4762

// Add offset for batched GEMM

4763

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4764

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4765

4766

// Multiply by the weight of matrix-matrix product and store the result

4767

#if defined(ALPHA)

4768

SCALE_BLOCK(4, float, c, ALPHA);

4769

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4774

4775

#if defined(BROADCAST_BIAS)

4776

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4777

4778

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4779

4780

#ifndef UNIT_BETA

4781

SCALE_BLOCK(1, float, bias, BETA);

4782

#endif // UNIT_BETA

4783

4784

// c = c + bias[broadcasted]

4785

ADD_BLOCK_BROADCAST(4, c, bias0);

4786

4787

#else // defined(BROADCAST_BIAS)

4788

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

4789

2) * src2_stride_z;

4790

4791

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4792

4793

#ifndef UNIT_BETA

4794

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

4799

4800

#endif // defined(BROADCAST_BIAS)

4801

#endif // defined(BETA)

4802

4803

#if defined(ACTIVATION_TYPE)

4804

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

4805

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4806

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4807

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4808

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

4809

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

4810

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

4811

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4812

}

4813

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

4814

// Undefine local defines

4815

#undef COLS_MTX_B

4816

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

4817

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4818

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4819

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4820

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4821

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4822

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4823

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4824

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4825

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4826

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

4827

* The activation function is performed after the bias addition

4828

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4829

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4830

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4831

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4832

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4833

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4834

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

4835

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4836

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

4837

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4838

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

4839

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4840

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4841

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4842

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

4843

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4844

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

4845

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4846

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4847

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4848

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4849

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4850

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4851

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4852

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4853

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4854

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4855

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4856

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4857

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4858

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4859

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4860

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4861

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4862

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4863

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4864

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

4865

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4866

#if defined(BETA)

4867

IMAGE_DECLARATION(src2),

4868

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4869

IMAGE_DECLARATION(dst),

4870

uint src0_stride_z,

4871

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4872

#if defined(BETA)

4873

uint src2_stride_z,

4874

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4875

uint dst_stride_z

4876

#if defined(REINTERPRET_OUTPUT_AS_3D)

4877

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4878

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4879

#endif // REINTERPRET_OUTPUT_AS_3D

4880

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4881

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4882

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4883

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4884

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4885

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4886

// Offset

4887

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4888

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4889

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4890

// src_addr_a = address of matrix A

4891

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4892

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4893

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4894

4895

#if defined(MATRIX_B_DEPTH)

4896

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4897

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4898

#else // defined(MATRIX_B_DEPTH)

4899

src1_addr_in_bytes += z * src1_stride_z;

4900

#endif // defined(MATRIX_B_DEPTH)

4901

4902

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

4903

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4904

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4905

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4906

__global half *src_end_addr_b = src_addr_b + COLS_B;

4907

4908

src_addr_a += offset_row_a;

4909

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4910

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4911

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4916

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4917

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4918

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4919

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4920

half4 a0 = vload4(0, src_addr_a);

4921

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4922

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4923

c0 += (half8)a0.s0 * b0;

4924

c1 += (half8)a0.s1 * b0;

4925

c2 += (half8)a0.s2 * b0;

4926

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4927

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4928

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4929

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

4930

b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4931

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4932

c0 += (half8)a0.s0 * b0;

4933

c1 += (half8)a0.s1 * b0;

4934

c2 += (half8)a0.s2 * b0;

4935

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4936

}

4937

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4938

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4939

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4940

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4941

half4 a0 = vload4(0, src_addr_a);

4942

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4943

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4944

c0 += (half8)a0.s0 * b0;

4945

c1 += (half8)a0.s1 * b0;

4946

c2 += (half8)a0.s2 * b0;

4947

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4948

}

4949

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4950

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4951

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4952

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4953

// Compute dst address

4954

__global uchar *dst_addr = offset(&dst, 0, 0);

4955

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4956

uint4 zout = 0;

4957

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4958

#if defined(REINTERPRET_OUTPUT_AS_3D)

4959

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4960

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4961

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4966

// |******************|

4967

// | cross_plane_pad |

4968

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4973

4974

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4975

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4976

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4977

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4978

// Add offset due to the cross plane paddings

4979

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4980

4981

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4982

// multiply dst_stride_z by DEPTH_GEMM3D

4983

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4984

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4985

// Add offset for batched GEMM

4986

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4987

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4988

4989

// Multiply by the weight of matrix-matrix product and store the result

4990

#if defined(ALPHA)

4991

SCALE_BLOCK(4, half, c, ALPHA);

4992

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4997

4998

#if defined(BROADCAST_BIAS)

4999

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5000

5001

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5002

5003

#ifndef UNIT_BETA

5004

SCALE_BLOCK(1, half, bias, BETA);

5005

#endif // UNIT_BETA

5006

5007

// c = c + bias[broadcasted]

5008

ADD_BLOCK_BROADCAST(4, c, bias0);

5009

5010

#else // defined(BROADCAST_BIAS)

5011

5012

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5013

2) * src2_stride_z;

5014

5015

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5016

5017

#ifndef UNIT_BETA

5018

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

5023

5024

#endif // defined(BROADCAST_BIAS)

5025

#endif // defined(BETA)

5026

5027

#if defined(ACTIVATION_TYPE)

5028

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

5029

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5030

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5031

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5032

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5033

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5034

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5035

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5036

}

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5037

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5038

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in 32-bit floating point accumulators.

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5039

*

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5040

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5041

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

5042

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

5043

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5044

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5045

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5046

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5047

* The activation function is performed after the bias addition

5048

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5049

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5050

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5051

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5052

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5053

*

5054

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5055

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5056

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5057

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5058

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5059

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5060

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5061

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5062

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5063

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5064

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5065

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5066

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5067

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5068

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5069

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5070

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5071

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5072

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

5073

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5074

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5075

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5076

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5077

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

5078

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5079

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5080

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5081

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

5082

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

5083

*/

5084

__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),

5085

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5086

#if defined(BETA)

5087

IMAGE_DECLARATION(src2),

5088

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5089

IMAGE_DECLARATION(dst),

5090

uint src0_stride_z,

5091

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5092

#if defined(BETA)

5093

uint src2_stride_z,

5094

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5095

uint dst_stride_z

5096

#if defined(REINTERPRET_OUTPUT_AS_3D)

5097

,

5098

uint cross_plane_pad

5099

#endif // REINTERPRET_OUTPUT_AS_3D

5100

)

5101

{

5102

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

5103

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

5104

int z = get_global_id(2);

5105

5106

// Offset

5107

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

5108

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

5109

5110

// src_addr_a = address of matrix A

5111

// src_addr_b = address of matrix B

5112

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

5113

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

5114

5115

#if defined(MATRIX_B_DEPTH)

5116

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5117

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

5118

#else // defined(MATRIX_B_DEPTH)

5119

src1_addr_in_bytes += z * src1_stride_z;

5120

#endif // defined(MATRIX_B_DEPTH)

5121

5122

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

5123

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

5124

5125

// Compute end row address for matrix B

5126

__global half *src_end_addr_b = src_addr_b + COLS_B;

5127

5128

src_addr_a += offset_row_a;

5129

src_addr_b += offset_row_b;

5130

5131

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float8 c0 = 0.0f;

float8 c1 = 0.0f;

float8 c2 = 0.0f;

float8 c3 = 0.0f;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5136

5137

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

5138

{

5139

// Load values from matrix A (interleaved) and matrix B (transposed)

5140

float4 a0 = convert_float4(vload4(0, src_addr_a));

5141

float8 b0 = convert_float8(vload8(0, src_addr_b));

5142

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5143

c0 += (float8)a0.s0 * b0;

5144

c1 += (float8)a0.s1 * b0;

5145

c2 += (float8)a0.s2 * b0;

5146

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5147

5148

// Load values from matrix A (interleaved) and matrix B (transposed)

5149

a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));

5150

b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));

5151

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5152

c0 += (float8)a0.s0 * b0;

5153

c1 += (float8)a0.s1 * b0;

5154

c2 += (float8)a0.s2 * b0;

5155

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5156

}

5157

5158

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

5159

{

5160

// Load values from matrix A (interleaved) and matrix B (transposed)

5161

float4 a0 = convert_float4(vload4(0, src_addr_a));

5162

float8 b0 = convert_float8(vload8(0, src_addr_b));

5163

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5164

c0 += (float8)a0.s0 * b0;

5165

c1 += (float8)a0.s1 * b0;

5166

c2 += (float8)a0.s2 * b0;

5167

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5168

}

5169

5170

// Compute destination address

5171

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5172

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5173

// Compute dst address

5174

__global uchar *dst_addr = offset(&dst, 0, 0);

5175

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5176

uint4 zout = 0;

5177

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5178

#if defined(REINTERPRET_OUTPUT_AS_3D)

5179

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

5180

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5186

// |******************|

5187

// | cross_plane_pad |

5188

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5193

5194

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5195

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

5196

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5197

5198

// Add offset due to the cross plane paddings

5199

zout *= (cross_plane_pad * dst_stride_y);

5200

5201

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5202

// multiply dst_stride_z by DEPTH_GEMM3D

5203

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5204

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5205

// Add offset for batched GEMM

5206

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5207

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5208

5209

// Multiply by the weight of matrix-matrix product and store the result

5210

#if defined(ALPHA)

5211

SCALE_BLOCK(4, float, c, ALPHA);

5212

#endif // defined(ALPHA)

5213

5214

#if defined(BETA)

5215

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5216

5217

#if defined(BROADCAST_BIAS)

5218

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5219

5220

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5221

5222

float8 bias_f0 = convert_float8(bias0);

5223

5224

#ifndef UNIT_BETA

5225

SCALE_BLOCK(1, float, bias_f, BETA);

5226

#endif // UNIT_BETA

5227

5228

// c = c + bias[broadcasted]

5229

ADD_BLOCK_BROADCAST(4, c, bias_f0);

5230

5231

#else // defined(BROADCAST_BIAS)

5232

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5233

2) * src2_stride_z;

5234

5235

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5236

5237

float8 bias_f0 = convert_float8(bias0);

5238

float8 bias_f1 = convert_float8(bias1);

5239

float8 bias_f2 = convert_float8(bias2);

5240

float8 bias_f3 = convert_float8(bias3);

5241

5242

#ifndef UNIT_BETA

5243

SCALE_BLOCK(4, float, bias_f, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias_f);

5248

5249

#endif // defined(BROADCAST_BIAS)

5250

#endif // defined(BETA)

5251

5252

half8 c_h0 = convert_half8(c0);

5253

half8 c_h1 = convert_half8(c1);

5254

half8 c_h2 = convert_half8(c2);

5255

half8 c_h3 = convert_half8(c3);

5256

5257

#if defined(ACTIVATION_TYPE)

5258

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);

5259

#endif // defined(ACTIVATION_TYPE)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5260

5261

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5262

vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5263

vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5264

vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5265

vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5266

}

5267

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5268

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5269

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5270

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5271

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

5272

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

5273

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5274

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5275

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5276

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5277

* The activation function is performed after the bias addition

5278

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5279

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5280

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5281

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5282

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5283

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5284

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5285

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5286

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5287

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5288

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5289

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5290

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5291

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5292

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5293

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5294

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5295

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5296

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5297

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5298

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5299

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5300

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5301

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5302

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

5303

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5304

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5305

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5306

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5307

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5308

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5309

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

5310

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5311

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5312

*/

5313

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                         IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                         IMAGE_DECLARATION(dst),
                                                         uint src0_stride_z,
                                                         uint src1_stride_z,
#if defined(BETA)
                                                         uint src2_stride_z,
#endif //defined(BETA)
                                                         uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                         ,
                                                         uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                         )
{
    // Row of reshaped B (x) and row of reshaped A (y) handled by this work-item, plus the batch index (z)
    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
    int z = get_global_id(2);

    // Element offset of this work-item's 4-value (A) and 8-value (B) group inside the selected reshaped row
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src1_addr_in_bytes += z * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators: c0..c3 hold the four 8-element output rows of the 4x8 block
    half8 c0 = 0.0f;
    half8 c1 = 0.0f;
    half8 c2 = 0.0f;
    half8 c3 = 0.0f;

    // Number of accumulation steps. The matching #undef lives after the kernel body, outside this block.
#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

    int i = 0;

    // Main loop: four accumulation steps per iteration
    for(; i <= (int)(COLS_MTX_B - 4); i += 4)
    {
#if MULT_INTERLEAVE4X4_HEIGHT == 1
        // A single 8-element load of A supplies two steps: lanes s0-s3 pair with the first
        // vector of B and lanes s4-s7 with the following one.

        // Load values from matrix A (interleaved) and matrix B (transposed)
        half8 a0 = vload8(0, src_addr_a);
        half8 b0 = vload8(0, src_addr_b);

        src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);

        // Load values from matrix B (transposed)
        b0 = vload8(0, src_addr_b);

        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s4, b0, c0);
        c1 = fma((half8)a0.s5, b0, c1);
        c2 = fma((half8)a0.s6, b0, c2);
        c3 = fma((half8)a0.s7, b0, c3);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload8(0, src_addr_a);
        b0 = vload8(0, src_addr_b);

        src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);

        // Load values from matrix B (transposed)
        b0 = vload8(0, src_addr_b);

        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s4, b0, c0);
        c1 = fma((half8)a0.s5, b0, c1);
        c2 = fma((half8)a0.s6, b0, c2);
        c3 = fma((half8)a0.s7, b0, c3);
#else  // MULT_INTERLEAVE4X4_HEIGHT == 1
        // Load values from matrix A (interleaved) and matrix B (transposed)
        half4 a0 = vload4(0, src_addr_a);
        half8 b0 = vload8(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a);
        b0 = vload8(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a);
        b0 = vload8(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload4(0, src_addr_a);
        b0 = vload8(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);
#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
    }

    // Left-over loop: one accumulation step per iteration
    for(; i < (int)(COLS_MTX_B); ++i)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        half4 a0 = vload4(0, src_addr_a);
        half8 b0 = vload8(0, src_addr_b);

        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

        c0 = fma((half8)a0.s0, b0, c0);
        c1 = fma((half8)a0.s1, b0, c1);
        c2 = fma((half8)a0.s2, b0, c2);
        c3 = fma((half8)a0.s3, b0, c3);
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row extra byte offset; stays zero unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Add offset due to the cross plane paddings
    zout *= (cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(4, half, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to all four output rows
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, half, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(4, c, bias0);

#else  // defined(BROADCAST_BIAS)
    // One bias row per output row, taken from the same x/y/z position as the output block
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y)
                                + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(4, half, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(4, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store 4x8 block
    vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
    vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
    vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
    vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
}

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

5567

5568

// Undefine local defines

5569

#undef COLS_MTX_B

5570

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

5571

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5572

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

5573

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5574

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5575

#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

5576

#if defined(DATA_TYPE)

5577

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5578

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.

5579

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5580

* @note This OpenCL kernel works with floating point data types (F16/F32)

5581

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

5582

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5583

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5584

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5585

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5586

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5587

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5588

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5589

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5590

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5591

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5592

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5593

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5594

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5595

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5596

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5597

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5598

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5599

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5600

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5601

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5602

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5603

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5604

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5605

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5606

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5607

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5608

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5609

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5610

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5611

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5612

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5613

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5614

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5615

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5616

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

5617

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5618

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

5619

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5620

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5621

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5622

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5623

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5624

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5625

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5626

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5627

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

5628

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5629

#if defined(BETA)

5630

IMAGE_DECLARATION(src2),

5631

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5632

IMAGE_DECLARATION(dst),

5633

uint src0_stride_z,

5634

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5635

#if defined(BETA)

5636

uint src2_stride_z,

5637

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5638

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5639

#if defined(REINTERPRET_INPUT_AS_3D)

5640

,

5641

uint src_cross_plane_pad

5642

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5643

#if defined(REINTERPRET_OUTPUT_AS_3D)

5644

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5645

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5646

#endif // REINTERPRET_OUTPUT_AS_3D

5647

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5648

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5649

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5650

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5651

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5652

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5653

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5654

// Update address for the matrix A

5655

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5656

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5657

// Update address for the matrix B

5658

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5659

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5660

#if defined(REINTERPRET_INPUT_AS_3D)

5661

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5662

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5668

// |******************|

5669

// | cross_plane_pad |

5670

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5675

5676

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5677

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5678

zin = min(DEPTH_GEMM3D - 1, zin);

5679

5680

// Add offset due to the cross plane paddings

5681

zin *= (src_cross_plane_pad * src0_stride_y);

5682

5683

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5684

// multiply src0_stride_z by DEPTH_GEMM3D

5685

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5686

5687

#else // defined(REINTERPRET_INPUT_AS_3D)

5688

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5689

// Add offset for batched GEMM

5690

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5691

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5692

#endif // defined(REINTERPRET_INPUT_AS_3D)

5693

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5694

#if defined(MATRIX_B_DEPTH)

5695

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5696

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

5697

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5698

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5699

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5700

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5701

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

5702

5703

VECTOR_TYPE acc0 = 0.0f;

5704

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5705

VECTOR_TYPE acc1 = 0.0f;

5706

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5707

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5708

VECTOR_TYPE acc2 = 0.0f;

5709

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5710

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5711

VECTOR_TYPE acc3 = 0.0f;

5712

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5713

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

5714

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5715

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5716

#if defined(REINTERPRET_INPUT_AS_3D)

5717

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5718

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

5719

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5720

// Load values from matrix A

5721

VEC_DATA_TYPE(DATA_TYPE, 2)

5722

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5723

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5724

VEC_DATA_TYPE(DATA_TYPE, 2)

5725

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5726

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5727

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5728

VEC_DATA_TYPE(DATA_TYPE, 2)

5729

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5730

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5731

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5732

VEC_DATA_TYPE(DATA_TYPE, 2)

5733

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5734

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5735

#endif // defined(REINTERPRET_INPUT_AS_3D)

5736

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5737

// Load values from matrix B

5738

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

5739

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5740

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5741

// Accumulate

5742

acc0 += b0 * (VECTOR_TYPE)a0.s0;

5743

acc0 += b1 * (VECTOR_TYPE)a0.s1;

5744

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5745

acc1 += b0 * (VECTOR_TYPE)a1.s0;

5746

acc1 += b1 * (VECTOR_TYPE)a1.s1;

5747

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5748

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5749

acc2 += b0 * (VECTOR_TYPE)a2.s0;

5750

acc2 += b1 * (VECTOR_TYPE)a2.s1;

5751

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5752

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5753

acc3 += b0 * (VECTOR_TYPE)a3.s0;

5754

acc3 += b1 * (VECTOR_TYPE)a3.s1;

5755

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5756

}

5757

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5758

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5759

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5760

#if defined(REINTERPRET_INPUT_AS_3D)

5761

// Load values from matrix A

5762

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

5763

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5764

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

5765

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5766

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5767

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

5768

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5769

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5770

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

5771

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5772

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5773

// Load values from matrix A

5774

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5775

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5776

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5777

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5778

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5779

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5780

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5781

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5782

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5783

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5784

#endif // defined(REINTERPRET_INPUT_AS_3D)

5785

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5786

// Load values from matrix B

5787

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5788

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5789

// Accumulate

5790

acc0 += b0 * (VECTOR_TYPE)a0;

5791

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5792

acc1 += b0 * (VECTOR_TYPE)a1;

5793

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5794

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5795

acc2 += b0 * (VECTOR_TYPE)a2;

5796

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5797

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5798

acc3 += b0 * (VECTOR_TYPE)a3;

5799

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5800

}

5801

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5802

int z = get_global_id(2);

5803

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5804

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5805

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5806

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5807

// Compute dst address

5808

__global uchar *dst_addr = offset(&dst, 0, 0);

5809

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5810

uint4 zout = 0;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5811

5812

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5813

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5814

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5815

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5816

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5821

// |******************|

5822

// | cross_plane_pad |

5823

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5828

5829

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5830

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5831

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5832

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5833

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5834

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5835

5836

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5837

// multiply dst_stride_z by DEPTH_GEMM3D

5838

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5839

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5840

// Add offset for batched GEMM

5841

dst_addr += z * dst_stride_z;

5842

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5843

5844

// Multiply by the weight of matrix-matrix product and store the result

5845

#if defined(ALPHA)

5846

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);

5847

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

5852

5853

#if defined(BROADCAST_BIAS)

5854

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));

5855

5856

LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

5857

5858

#ifndef UNIT_BETA

5859

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

5860

#endif // UNIT_BIAS

5861

5862

// c = c + bias[broadcasted]

5863

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

5864

5865

#else // defined(BROADCAST_BIAS)

5866

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *

5867

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

5868

5869

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

5870

5871

#ifndef UNIT_BETA

5872

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

5877

5878

#endif // defined(BROADCAST_BIAS)

5879

#endif // defined(BETA)

5880

5881

#if defined(ACTIVATION_TYPE)

5882

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);

5883

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5884

5885

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5886

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5887

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5888

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5889

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

5890

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5891

*

5892

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

5893

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

5894

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

5895

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

5896

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5897

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5898

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5899

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5900

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5901

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5902

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5903

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5904

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5905

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5906

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5907

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped

5908

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5909

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5910

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5911

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5912

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5913

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5914

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5915

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5916

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5917

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5918

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5919

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5920

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5921

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5922

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5923

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5924

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5925

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5926

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5927

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

5928

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5929

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

5930

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5931

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

5932

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5933

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5934

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5935

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5936

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5937

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5938

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5939

*/

5940

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

5941

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5942

#if defined(BETA)

5943

IMAGE_DECLARATION(src2),

5944

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5945

IMAGE_DECLARATION(dst),

5946

uint src0_stride_z,

5947

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5948

#if defined(BETA)

5949

uint src2_stride_z,

5950

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5951

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5952

#if defined(REINTERPRET_INPUT_AS_3D)

5953

,

5954

uint src_cross_plane_pad

5955

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5956

#if defined(REINTERPRET_OUTPUT_AS_3D)

5957

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5958

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5959

#endif // REINTERPRET_OUTPUT_AS_3D

5960

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5961

{

5962

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

5963

5964

// Compute starting address for matrix A and matrix B

5965

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

5966

5967

// Update address for matrix A

5968

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

5969

5970

// Update address for matrix B

5971

src_addr.s1 += idx * sizeof(float);

5972

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5973

#if defined(REINTERPRET_INPUT_AS_3D)

5974

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5975

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5981

// |******************|

5982

// | cross_plane_pad |

5983

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5988

5989

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5990

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5991

zin = min(DEPTH_GEMM3D - 1, zin);

5992

5993

// Add offset due to the cross plane paddings

5994

zin *= (src_cross_plane_pad * src0_stride_y);

5995

5996

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5997

// multiply src0_stride_z by DEPTH_GEMM3D

5998

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5999

6000

#else // defined(REINTERPRET_INPUT_AS_3D)

6001

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6002

// Add offset for batched GEMM

6003

src_addr.s0 += get_global_id(2) * src0_stride_z;

6004

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6005

#endif // defined(REINTERPRET_INPUT_AS_3D)

6006

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6007

#if defined(MATRIX_B_DEPTH)

6008

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6009

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6010

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6011

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6012

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6013

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6014

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6015

float4 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6016

6017

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6018

float4 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6019

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6020

6021

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6022

float4 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6023

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6024

6025

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6026

float4 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6027

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6028

6029

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6030

int i = 0;

6031

for(; i <= ((int)COLS_A - 4); i += 4)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6032

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6033

#if defined(REINTERPRET_INPUT_AS_3D)

6034

// Load values from matrix A and matrix B

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6035

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

6036

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6037

// Load values from matrix A and matrix B

6038

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6039

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6040

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6041

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6042

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6043

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6044

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6045

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6046

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6047

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6048

#endif // defined(REINTERPRET_INPUT_AS_3D)

6049

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6050

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6051

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6052

6053

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6054

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

6055

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

6056

acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);

6057

acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6058

6059

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6060

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6061

acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);

6062

acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);

6063

acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);

6064

acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6065

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6066

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6067

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6068

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6069

acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);

6070

acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);

6071

acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);

6072

acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6073

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6074

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6075

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6076

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6077

acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);

6078

acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);

6079

acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);

6080

acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6081

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6082

6083

// Load values from matrix A and matrix B

6084

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6085

src_addr.s1 += src1_stride_y;

6086

6087

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6088

acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);

6089

acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);

6090

acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);

6091

acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6092

6093

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6094

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6095

acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);

6096

acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);

6097

acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);

6098

acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6099

6100

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6101

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6102

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6103

acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);

6104

acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);

6105

acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);

6106

acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6107

6108

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6109

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6110

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6111

acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);

6112

acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);

6113

acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);

6114

acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6115

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6116

6117

// Load values from matrix A and matrix B

6118

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6119

src_addr.s1 += src1_stride_y;

6120

6121

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6122

acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);

6123

acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);

6124

acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);

6125

acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6126

6127

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6128

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6129

acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);

6130

acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);

6131

acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);

6132

acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6133

6134

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6135

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6136

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6137

acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);

6138

acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);

6139

acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);

6140

acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6141

6142

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6143

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6144

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6145

acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);

6146

acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);

6147

acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);

6148

acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6149

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6150

6151

// Load values from matrix A and matrix B

6152

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6153

src_addr.s1 += src1_stride_y;

6154

6155

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6156

acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);

6157

acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);

6158

acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);

6159

acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6160

6161

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6162

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6163

acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);

6164

acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);

6165

acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);

6166

acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6167

6168

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6169

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6170

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6171

acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);

6172

acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);

6173

acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);

6174

acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6175

6176

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6177

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6178

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6179

acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);

6180

acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);

6181

acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);

6182

acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6183

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6184

6185

src_addr.s0 += 4 * sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6186

}

6187

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6188

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6189

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6190

#if defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6191

// Load values from matrix A

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6192

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6193

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6194

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6195

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6196

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6197

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6198

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6199

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6200

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6201

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6202

#else // defined(REINTERPRET_INPUT_AS_3D)

6203

// Load values from matrix A

6204

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6205

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6206

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6207

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6208

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6209

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6210

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6211

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6212

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6213

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6214

#endif // defined(REINTERPRET_INPUT_AS_3D)

6215

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6216

// Load values from matrix B

6217

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6218

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6219

6220

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6221

acc0.s0 = fma(a0, b0.s0, acc0.s0);

6222

acc0.s1 = fma(a0, b0.s1, acc0.s1);

6223

acc0.s2 = fma(a0, b0.s2, acc0.s2);

6224

acc0.s3 = fma(a0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6225

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6226

acc1.s0 = fma(a1, b0.s0, acc1.s0);

6227

acc1.s1 = fma(a1, b0.s1, acc1.s1);

6228

acc1.s2 = fma(a1, b0.s2, acc1.s2);

6229

acc1.s3 = fma(a1, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6230

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6231

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6232

acc2.s0 = fma(a2, b0.s0, acc2.s0);

6233

acc2.s1 = fma(a2, b0.s1, acc2.s1);

6234

acc2.s2 = fma(a2, b0.s2, acc2.s2);

6235

acc2.s3 = fma(a2, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6236

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6237

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6238

acc3.s0 = fma(a3, b0.s0, acc3.s0);

6239

acc3.s1 = fma(a3, b0.s1, acc3.s1);

6240

acc3.s2 = fma(a3, b0.s2, acc3.s2);

6241

acc3.s3 = fma(a3, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6242

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6243

6244

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6245

}

6246

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6247

int z = get_global_id(2);

6248

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6249

// Compute destination address

6250

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6251

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6252

// Compute dst address

6253

__global uchar *dst_addr = offset(&dst, 0, 0);

6254

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6255

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

6256

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6257

#if defined(REINTERPRET_OUTPUT_AS_3D)

6258

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6259

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6260

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

6265

// |******************|

6266

// | cross_plane_pad |

6267

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6272

6273

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6274

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6275

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6276

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6277

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6278

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6279

6280

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6281

// multiply dst_stride_z by DEPTH_GEMM3D

6282

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6283

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6284

// Add offset for batched GEMM

6285

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6286

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6287

6288

// Multiply by the weight of matrix-matrix product and store the result

6289

#if defined(ALPHA)

6290

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

6291

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6296

6297

#if defined(BROADCAST_BIAS)

6298

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

6299

6300

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

6301

6302

#ifndef UNIT_BETA

6303

SCALE_BLOCK(1, float, bias, BETA);

6304

#endif // UNIT_BETA

6305

6306

// acc = acc + bias[broadcasted]

6307

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6308

6309

#else // defined(BROADCAST_BIAS)

6310

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *

6311

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6312

6313

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

6314

6315

#ifndef UNIT_BETA

6316

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6321

6322

#endif // defined(BROADCAST_BIAS)

6323

#endif // defined(BETA)

6324

6325

#if defined(ACTIVATION_TYPE)

6326

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

6327

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6328

6329

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6330

vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6331

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6332

vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6333

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6334

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6335

vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6336

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6337

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6338

vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6339

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6340

}

6341

6342

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

6343

*

6344

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

6345

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.

6346

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

6347

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

6348

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

6349

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6350

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

6351

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6352

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6353

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

6354

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6355

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

6356

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6357

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

6358

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

6359

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

6360

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

6361

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6362

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6363

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6364

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

6365

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6366

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

6367

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

6368

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

6369

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6370

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

6371

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6372

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

6373

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6374

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

6375

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

6376

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

6377

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

6378

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

6379

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6380

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

6381

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6382

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

6383

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6384

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

6385

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6386

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

6387

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6388

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6389

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6390

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

6391

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6392

*/

6393

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

6394

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6395

#if defined(BETA)

6396

IMAGE_DECLARATION(src2),

6397

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6398

IMAGE_DECLARATION(dst),

6399

uint src0_stride_z,

6400

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6401

#if defined(BETA)

6402

uint src2_stride_z,

6403

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6404

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6405

#if defined(REINTERPRET_INPUT_AS_3D)

6406

,

6407

uint src_cross_plane_pad

6408

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6409

#if defined(REINTERPRET_OUTPUT_AS_3D)

6410

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6411

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6412

#endif // REINTERPRET_OUTPUT_AS_3D

6413

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6414

{

6415

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6416

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

6417

6418

// Compute starting address for matrix A and Matrix B

6419

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

6420

6421

// Update address for the matrix A

6422

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6423

6424

// Update address for the matrix B

6425

src_addr.s1 += idx * sizeof(float);

6426

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6427

#if defined(REINTERPRET_INPUT_AS_3D)

6428

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6429

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

6435

// |******************|

6436

// | cross_plane_pad |

6437

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

6442

6443

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6444

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6445

zin = min(DEPTH_GEMM3D - 1, zin);

6446

6447

// Add offset due to the cross plane paddings

6448

zin *= (src_cross_plane_pad * src0_stride_y);

6449

6450

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6451

// multiply src0_stride_z by DEPTH_GEMM3D

6452

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6453

6454

#else // defined(REINTERPRET_INPUT_AS_3D)

6455

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6456

// Add offset for batched GEMM

6457

src_addr.s0 += get_global_id(2) * src0_stride_z;

6458

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6459

#endif // defined(REINTERPRET_INPUT_AS_3D)

6460

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6461

#if defined(MATRIX_B_DEPTH)

6462

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6463

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6464

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6465

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6466

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6467

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6468

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6469

float2 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6470

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6471

float2 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6472

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6473

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6474

float2 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6475

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6476

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6477

float2 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6478

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6479

6480

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6481

int i = 0;

6482

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6483

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6484

#if defined(REINTERPRET_INPUT_AS_3D)

6485

// Load values from matrix A

6486

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));

6487

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6488

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6489

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6490

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6491

6492

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6493

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6494

src_addr.s1 += src1_stride_y;

6495

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6496

src_addr.s1 += src1_stride_y;

6497

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6498

src_addr.s1 += src1_stride_y;

6499

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6500

src_addr.s1 += src1_stride_y;

6501

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6502

src_addr.s1 += src1_stride_y;

6503

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6504

src_addr.s1 += src1_stride_y;

6505

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6506

src_addr.s1 += src1_stride_y;

6507

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6508

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6509

6510

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6511

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

6512

acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);

6513

acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);

6514

acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);

6515

acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);

6516

acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);

6517

acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);

6518

acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6519

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6520

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

6521

acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);

6522

acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);

6523

acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);

6524

acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);

6525

acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);

6526

acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);

6527

acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6528

6529

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6530

#if defined(REINTERPRET_INPUT_AS_3D)

6531

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6532

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6533

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6534

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6535

acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);

6536

acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);

6537

acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);

6538

acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);

6539

acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);

6540

acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);

6541

acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);

6542

acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6543

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6544

acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);

6545

acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);

6546

acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);

6547

acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);

6548

acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);

6549

acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);

6550

acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);

6551

acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6552

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6553

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6554

#if defined(REINTERPRET_INPUT_AS_3D)

6555

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6556

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6557

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6558

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6559

acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);

6560

acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);

6561

acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);

6562

acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);

6563

acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);

6564

acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);

6565

acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);

6566

acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6567

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6568

acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);

6569

acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);

6570

acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);

6571

acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);

6572

acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);

6573

acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);

6574

acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);

6575

acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6576

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6577

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6578

#if defined(REINTERPRET_INPUT_AS_3D)

6579

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6580

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6581

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6582

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6583

acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);

6584

acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);

6585

acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);

6586

acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);

6587

acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);

6588

acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);

6589

acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);

6590

acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6591

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6592

acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);

6593

acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);

6594

acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);

6595

acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);

6596

acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);

6597

acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);

6598

acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);

6599

acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6600

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6601

6602

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6603

}

6604

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6605

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6606

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6607

#if defined(REINTERPRET_INPUT_AS_3D)

6608

// Load values from matrix A

6609

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6610

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6611

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6612

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6613

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6614

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6615

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6616

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6617

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6618

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6619

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6620

// Load values from matrix A

6621

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6622

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6623

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6624

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6625

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6626

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6627

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6628

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6629

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6630

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6631

#endif // defined(REINTERPRET_INPUT_AS_3D)

6632

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6633

// Load values from matrix B

6634

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6635

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6636

6637

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6638

acc0.s0 = fma(a0, b0.s0, acc0.s0);

6639

acc0.s1 = fma(a0, b0.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6640

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6641

acc1.s0 = fma(a1, b0.s0, acc1.s0);

6642

acc1.s1 = fma(a1, b0.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6643

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6644

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6645

acc2.s0 = fma(a2, b0.s0, acc2.s0);

6646

acc2.s1 = fma(a2, b0.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6647

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6648

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6649

acc3.s0 = fma(a3, b0.s0, acc3.s0);

6650

acc3.s1 = fma(a3, b0.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6651

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6652

6653

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6654

}

6655

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6656

int z = get_global_id(2);

6657

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6658

// Compute destination address

6659

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6660

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6661

// Compute dst address

6662

__global uchar *dst_addr = offset(&dst, 0, 0);

6663

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6664

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

6665

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6666

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6667

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6668

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6669

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6670

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// |                  |

// |      plane0      |

// |                  |

// |__________________|

6675

// |******************|

6676

// |  cross_plane_pad |

6677

// |******************|

// |                  |

// |      plane1      |

// |                  |

// |__________________|

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6682

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6683

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6684

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6685

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6686

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6687

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6688

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6689

6690

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6691

// multiply dst_stride_z by DEPTH_GEMM3D

6692

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6693

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6694

// Add offset for batched GEMM

6695

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6696

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6697

6698

// Multiply by the weight of matrix-matrix product and store the result

6699

#if defined(ALPHA)

6700

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

6701

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6706

6707

#if defined(BROADCAST_BIAS)

6708

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));

6709

6710

LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

6711

6712

#ifndef UNIT_BETA

6713

SCALE_BLOCK(1, float, bias, BETA);

6714

#endif // UNIT_BETA

6715

6716

// acc = acc + bias[broadcasted]

6717

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6718

6719

#else // defined(BROADCAST_BIAS)

6720

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *

6721

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6722

6723

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

6724

6725

#ifndef UNIT_BETA

6726

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6731

6732

#endif // defined(BROADCAST_BIAS)

6733

#endif // defined(BETA)

6734

6735

#if defined(ACTIVATION_TYPE)

6736

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

6737

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6738

6739

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6740

vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6741

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6742

vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6743

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6744

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6745

vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6746

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6747

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6748

vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6749

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6750

}

6751

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

6752

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6753

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

6754

*

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6755

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in 32-bit floating point variables.

6756

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

6757

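* This kernel loads 8 half values of matrix B and stores 8 half values of the output per work-item along X, so it is expected to be built with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=8.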

6758

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

6759

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6760

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

6761

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6762

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6763

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

6764

* The activation function is performed after the bias addition

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6765

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

6766

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

6767

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

6768

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

6769

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

6770

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

6771

*

6772

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

6773

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6774

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

6775

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6776

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

6777

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

6778

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

6779

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6780

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

6781

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6782

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

6783

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6784

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

6785

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

6786

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

6787

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

6788

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

6789

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6790

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

6791

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6792

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

6793

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6794

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

6795

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

6796

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

6797

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6798

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6799

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

6800

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

6801

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

6802

*/

6803

__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),

6804

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6805

#if defined(BETA)

6806

IMAGE_DECLARATION(src2),

6807

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6808

IMAGE_DECLARATION(dst),

6809

uint src0_stride_z,

6810

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6811

#if defined(BETA)

6812

uint src2_stride_z,

6813

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6814

uint dst_stride_z

6815

#if defined(REINTERPRET_INPUT_AS_3D)

6816

,

6817

uint src_cross_plane_pad

6818

#endif // REINTERPRET_INPUT_AS_3D

6819

#if defined(REINTERPRET_OUTPUT_AS_3D)

6820

,

6821

uint dst_cross_plane_pad

6822

#endif // REINTERPRET_OUTPUT_AS_3D

6823

)

6824

{

6825

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

6826

6827

// Compute starting address for matrix A and Matrix B

6828

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

6829

6830

// Update address for the matrix A

6831

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6832

6833

// Update address for the matrix B

6834

src_addr.s1 += idx * sizeof(half);

6835

6836

#if defined(REINTERPRET_INPUT_AS_3D)

6837

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6838

// in order to take into account the presence of possible cross plane paddings

//

// |                  |

// |      plane0      |

// |                  |

// |__________________|

6844

// |******************|

6845

// |  cross_plane_pad |

6846

// |******************|

// |                  |

// |      plane1      |

// |                  |

// |__________________|

6851

6852

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6853

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6854

zin = min(DEPTH_GEMM3D - 1, zin);

6855

6856

// Add offset due to the cross plane paddings

6857

zin *= (src_cross_plane_pad * src0_stride_y);

6858

6859

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6860

// multiply src0_stride_z by DEPTH_GEMM3D

6861

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6862

6863

#else // defined(REINTERPRET_INPUT_AS_3D)

6864

6865

// Add offset for batched GEMM

6866

src_addr.s0 += get_global_id(2) * src0_stride_z;

6867

6868

#endif // defined(REINTERPRET_INPUT_AS_3D)

6869

6870

#if defined(MATRIX_B_DEPTH)

6871

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6872

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6873

#else // defined(MATRIX_B_DEPTH)

6874

src_addr.s1 += get_global_id(2) * src1_stride_z;

6875

#endif // defined(MATRIX_B_DEPTH)

6876

6877

float8 acc0 = 0.0h;

6878

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6879

float8 acc1 = 0.0h;

6880

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6881

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6882

float8 acc2 = 0.0h;

6883

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6884

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6885

float8 acc3 = 0.0h;

6886

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6887

6888

int i = 0;

6889

for(; i <= ((int)COLS_A - 4); i += 4)

6890

{

6891

#if defined(REINTERPRET_INPUT_AS_3D)

6892

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6893

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

6894

#else // defined(REINTERPRET_INPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6895

// Load values from matrix A

6896

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6897

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6898

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6899

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6900

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6901

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6902

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6903

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6904

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6905

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6906

#endif // defined(REINTERPRET_INPUT_AS_3D)

6907

6908

// Load values from matrix B

6909

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6910

src_addr.s1 += src1_stride_y;

6911

6912

// Accumulate

6913

acc0 = fma(b0, (float8)a0.s0, acc0);

6914

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6915

acc1 = fma(b0, (float8)a1.s0, acc1);

6916

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6917

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6918

acc2 = fma(b0, (float8)a2.s0, acc2);

6919

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6920

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6921

acc3 = fma(b0, (float8)a3.s0, acc3);

6922

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6923

6924

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6925

src_addr.s1 += src1_stride_y;

6926

acc0 = fma(b0, (float8)a0.s1, acc0);

6927

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6928

acc1 = fma(b0, (float8)a1.s1, acc1);

6929

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6930

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6931

acc2 = fma(b0, (float8)a2.s1, acc2);

6932

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6933

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6934

acc3 = fma(b0, (float8)a3.s1, acc3);

6935

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6936

6937

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6938

src_addr.s1 += src1_stride_y;

6939

acc0 = fma(b0, (float8)a0.s2, acc0);

6940

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6941

acc1 = fma(b0, (float8)a1.s2, acc1);

6942

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6943

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6944

acc2 = fma(b0, (float8)a2.s2, acc2);

6945

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6946

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6947

acc3 = fma(b0, (float8)a3.s2, acc3);

6948

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6949

6950

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6951

src_addr.s1 += src1_stride_y;

6952

acc0 = fma(b0, (float8)a0.s3, acc0);

6953

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6954

acc1 = fma(b0, (float8)a1.s3, acc1);

6955

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6956

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6957

acc2 = fma(b0, (float8)a2.s3, acc2);

6958

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6959

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6960

acc3 = fma(b0, (float8)a3.s3, acc3);

6961

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6962

6963

src_addr.s0 += 4 * sizeof(half);

6964

}

6965

6966

for(; i < (int)COLS_A; ++i)

6967

{

6968

#if defined(REINTERPRET_INPUT_AS_3D)

6969

// Load values from matrix A

6970

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6971

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6972

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6973

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6974

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6975

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6976

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6977

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6978

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6979

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6980

#else // defined(REINTERPRET_INPUT_AS_3D)

6981

// Load values from matrix A

6982

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6983

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6984

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6985

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6986

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6987

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6988

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6989

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6990

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6991

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6992

#endif // defined(REINTERPRET_INPUT_AS_3D)

6993

6994

// Load values from matrix B

6995

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6996

6997

src_addr += (int2)(sizeof(half), src1_stride_y);

6998

6999

// Accumulate

7000

acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;

7001

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7002

acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;

7003

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7004

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7005

acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;

7006

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7007

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7008

acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;

7009

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7010

}

7011

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7012

int z = get_global_id(2);

7013

7014

// Compute destination address

7015

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

7016

7017

// Compute dst address

7018

__global uchar *dst_addr = offset(&dst, 0, 0);

7019

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7020

uint4 zout = 0;

7021

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7022

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7023

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7024

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

7025

// in order to take into account the presence of possible cross plane paddings

//

// |                  |

// |      plane0      |

// |                  |

// |__________________|

7031

// |******************|

7032

// |  cross_plane_pad |

7033

// |******************|

// |                  |

// |      plane1      |

// |                  |

// |__________________|

7038

7039

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7040

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

7041

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7042

7043

// Add offset due to the cross plane paddings

7044

zout *= (dst_cross_plane_pad * dst_stride_y);

7045

7046

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

7047

// multiply dst_stride_z by DEPTH_GEMM3D

7048

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7049

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7050

// Add offset for batched GEMM

7051

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7052

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7053

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7054

// Multiply by the weight of matrix-matrix product and store the result

7055

#if defined(ALPHA)

7056

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

7057

#endif // defined(ALPHA)

7058

7059

#if defined(BETA)

7060

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

7061

7062

#if defined(BROADCAST_BIAS)

7063

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

7064

7065

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7066

7067

float8 bias_f0 = convert_float8(bias0);

7068

7069

#ifndef UNIT_BETA

7070

SCALE_BLOCK(1, float, bias_f, BETA);

7071

#endif // UNIT_BETA

7072

7073

// acc = acc + bias[broadcasted]

7074

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);

7075

7076

#else // defined(BROADCAST_BIAS)

7077

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

7078

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

7079

7080

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7081

7082

float8 bias_f0 = convert_float8(bias0);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7083

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7084

float8 bias_f1 = convert_float8(bias1);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7085

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7086

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7087

float8 bias_f2 = convert_float8(bias2);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7088

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7089

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7090

float8 bias_f3 = convert_float8(bias3);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7091

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7092

7093

#ifndef UNIT_BETA

7094

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);

7099

7100

#endif // defined(BROADCAST_BIAS)

7101

#endif // defined(BETA)

7102

7103

half8 acc_h0 = convert_half8(acc0);

7104

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7105

half8 acc_h1 = convert_half8(acc1);

7106

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7107

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7108

half8 acc_h2 = convert_half8(acc2);

7109

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7110

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7111

half8 acc_h3 = convert_half8(acc3);

7112

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7113

7114

#if defined(ACTIVATION_TYPE)

7115

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);

7116

#endif // defined(ACTIVATION_TYPE)

7117

7118

// Store the output block

7119

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7120

}

7121

7122

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

7123

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7124

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.

7125

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

7126

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

7127

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

7128

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7129

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

7130

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7131

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7132

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

7133

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7134

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

7135

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7136

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

7137

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

7138

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

7139

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

7140

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7141

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

7142

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

7143

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7144

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

7145

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7146

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

7147

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

7148

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

7149

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7150

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

7151

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7152

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7153

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

7154

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

7155

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

7156

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

7157

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

7158

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7159

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

7160

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7161

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7162

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7163

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

7164

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7165

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

7166

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7167

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7168

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7169

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

7170

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7171

*/

7172

__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                 IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
#if defined(BETA)
                                                 uint src2_stride_z,
#endif // defined(BETA)
                                                 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                 ,
                                                 uint src_cross_plane_pad
#endif // defined(REINTERPRET_INPUT_AS_3D)
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                                                 )
{
    // Byte offsets into matrix A (.s0) and matrix B (.s1), starting at the first element of each
    int2 in_offset = (int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes);

    // First column of matrix B handled by this work-item
    int first_col = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Matrix B: jump to the first column of this work-item
    in_offset.s1 += first_col * sizeof(half);

    // Matrix A: jump to the first row of this work-item
    in_offset.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

#if defined(REINTERPRET_INPUT_AS_3D)
    // The 2D input tile is read from a 3D tensor, so every row has to know which plane
    // it belongs to in order to skip the cross plane padding that precedes it:
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // Plane index of each row: M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) divided by HEIGHT_GEMM3D
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zin       = min(DEPTH_GEMM3D - 1, zin);

    // Turn the plane index into the byte offset introduced by the cross plane paddings
    zin = zin * (src_cross_plane_pad * src0_stride_y);

    // Batched GEMM: the batches live in the fourth dimension, hence src0_stride_z
    // is multiplied by DEPTH_GEMM3D
    in_offset.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Batched GEMM: move matrix A to the current batch
    in_offset.s0 += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Matrix B has 3 dimensions while matrix A has more than 3: wrap the batch index so B is not slid past its depth
    in_offset.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    in_offset.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // One half8 accumulator per output row
    half8 acc0 = 0.0h;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    half8 acc1 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    half8 acc2 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    half8 acc3 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: consume four columns of A (and therefore four rows of B) per iteration
    int k = 0;
    while(k <= ((int)COLS_A - 4))
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Matrix A: four values per row, including the per-row cross plane offset
        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, in_offset.s0, src0_stride_y, zin.s);
#else  // defined(REINTERPRET_INPUT_AS_3D)
        // Matrix A: four values per row
        half4 a0 = vload4(0, (__global half *)(src0_ptr + in_offset.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half4 a1 = vload4(0, (__global half *)(src0_ptr + in_offset.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half4 a2 = vload4(0, (__global half *)(src0_ptr + in_offset.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half4 a3 = vload4(0, (__global half *)(src0_ptr + in_offset.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Step 0: first row of B against element .s0 of every A row
        half8 b_row = vload8(0, (__global half *)(src1_ptr + in_offset.s1));
        in_offset.s1 += src1_stride_y;

        acc0 = fma(b_row, (half8)a0.s0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b_row, (half8)a1.s0, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b_row, (half8)a2.s0, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b_row, (half8)a3.s0, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Step 1: second row of B against element .s1 of every A row
        b_row = vload8(0, (__global half *)(src1_ptr + in_offset.s1));
        in_offset.s1 += src1_stride_y;

        acc0 = fma(b_row, (half8)a0.s1, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b_row, (half8)a1.s1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b_row, (half8)a2.s1, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b_row, (half8)a3.s1, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Step 2: third row of B against element .s2 of every A row
        b_row = vload8(0, (__global half *)(src1_ptr + in_offset.s1));
        in_offset.s1 += src1_stride_y;

        acc0 = fma(b_row, (half8)a0.s2, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b_row, (half8)a1.s2, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b_row, (half8)a2.s2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b_row, (half8)a3.s2, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Step 3: fourth row of B against element .s3 of every A row
        b_row = vload8(0, (__global half *)(src1_ptr + in_offset.s1));
        in_offset.s1 += src1_stride_y;

        acc0 = fma(b_row, (half8)a0.s3, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b_row, (half8)a1.s3, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b_row, (half8)a2.s3, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b_row, (half8)a3.s3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Matrix A: advance by the four columns just consumed
        in_offset.s0 += 4 * sizeof(half);
        k += 4;
    }

    // Left-over loop: one column of A (one row of B) per iteration
    while(k < (int)COLS_A)
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Matrix A: one value per row, including the per-row cross plane offset
        half a0 = *((__global half *)(src0_ptr + in_offset.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + in_offset.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + in_offset.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + in_offset.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        // Matrix A: one value per row
        half a0 = *((__global half *)(src0_ptr + in_offset.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + in_offset.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + in_offset.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + in_offset.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Matrix B: one row of eight values
        half8 b_row = vload8(0, (__global half *)(src1_ptr + in_offset.s1));

        // Advance A by one element and B by one row
        in_offset.s0 += sizeof(half);
        in_offset.s1 += src1_stride_y;

        // Accumulate
        acc0 = fma(b_row, (half8)a0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b_row, (half8)a1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b_row, (half8)a2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b_row, (half8)a3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        ++k;
    }

    // Batch index of this work-item
    int batch = get_global_id(2);

    // Destination address of the output tile
    Image           dst      = CONVERT_TO_IMAGE_STRUCT(dst);
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row byte offset due to cross plane paddings (stays 0 for a plain 2D output)
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The 2D output tile is written into a 3D tensor, so every row has to know which plane
    // it belongs to in order to skip the cross plane padding that precedes it:
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // Plane index of each row: M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) divided by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Turn the plane index into the byte offset introduced by the cross plane paddings
    zout = zout * (dst_cross_plane_pad * dst_stride_y);

    // Batched GEMM: the batches live in the fourth dimension, hence dst_stride_z
    // is multiplied by DEPTH_GEMM3D
    dst_addr += batch * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Batched GEMM: move the destination to the current batch
    dst_addr += batch * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Scale the matrix-matrix product by alpha
#if defined(ALPHA)
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);
#endif // defined(ALPHA)

    // Add beta * bias
#if defined(BETA)
    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // A single bias row is shared by every output row
    __global uchar *bias_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

    LOAD_BLOCK(1, 8, half, bias, bias_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, half, bias, BETA);
#endif // UNIT_BETA

    // acc = acc + bias[broadcasted]
    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

#else // defined(BROADCAST_BIAS)
    // One bias row per output row, taken from the current batch
    __global uchar *bias_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, bias_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);
#endif // UNIT_BETA

    // acc = acc + bias
    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // The activation, when requested, is applied after the bias addition
#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Write the output tile
    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
}

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

7467

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7468

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

7469

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7470

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7471

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7472

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

7473

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7474

* @note The value of beta needs to be passed at compile time using -DBETA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7475

*

7476

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

7477

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

7478

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7479

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

7480

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7481

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

7482

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7483

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7484

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7485

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7486

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7487

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7488

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7489

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

7490

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7491

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7492

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7493

__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item views of matrix C (src) and of the A x B result (dst)
    Tensor3D c_tensor  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D ab_tensor = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // The result is accumulated in place, so dst is both read and written
    __global float *ab_ptr = (__global float *)ab_tensor.ptr;
    __global float *c_ptr  = (__global float *)c_tensor.ptr;

    // Four values of the A x B result already stored in dst
    float4 alpha_ab = vload4(0, ab_ptr);

    // Four values of matrix C
    float4 c = vload4(0, c_ptr);

    // dst = (A x B result) + beta * C
    vstore4(alpha_ab + (float4)BETA * c, 0, ab_ptr);
}

7512

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

7513

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7514

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

7515

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7516

* @note The value of beta needs to be passed at compile time using -DBETA

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7517

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7518

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

7519

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

7520

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7521

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

7522

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7523

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

7524

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7525

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7526

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7527

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7528

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7529

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7530

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7531

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

7532

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7533

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7534

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7535

__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item views of matrix C (src) and of the A x B result (dst)
    Tensor3D c_tensor  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D ab_tensor = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // The result is accumulated in place, so dst is both read and written
    __global half *ab_ptr = (__global half *)ab_tensor.ptr;
    __global half *c_ptr  = (__global half *)c_tensor.ptr;

    // Eight values of the A x B result already stored in dst
    half8 alpha_ab = vload8(0, ab_ptr);

    // Eight values of matrix C
    half8 c = vload8(0, c_ptr);

    // dst = (A x B result) + beta * C
    vstore8(alpha_ab + (half8)BETA * c, 0, ab_ptr);
}

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

7554

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7555

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7556

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7557

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7558

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

7559

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7560

* @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7561

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7562

* @note The input A and matrix B must not be reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7563

*

7564

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

7565

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

7566

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7567

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

7568

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7569

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7570

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7571

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

7572

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7573

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

7574

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7575

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

7576

* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

7577

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7578

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7579

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7580

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7581

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7582

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

7583

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7584

*/

7585

__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
                             TENSOR3D_DECLARATION(src1),
                             IMAGE_DECLARATION(dst))
{
    // Row of A (and matching slice of B) handled by this work-item
    int row = get_global_id(1);

    // First of the four output columns handled by this work-item
    int col = get_global_id(0) * 4;

    // Byte offsets into vector A (.s0) and matrix B (.s1)
    int2 pos = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * row, src1_offset_first_element_in_bytes + src1_stride_z * row));
    pos.s1 += col * sizeof(float);

    // Byte offset one past the last element of the current row of A
    int a_row_end = pos.s0 + (WIDTH_VECTOR_A * sizeof(float));

    float4 acc = 0.0f;

    // Main loop: two elements of A against two rows of B per iteration
    while(pos.s0 <= (a_row_end - 2 * (int)sizeof(float)))
    {
        float2 a_pair = vload2(0, (__global float *)(src0_ptr + pos.s0));
        float4 b_top  = vload4(0, (__global float *)(src1_ptr + pos.s1));
        float4 b_next = vload4(0, (__global float *)(src1_ptr + pos.s1 + src1_stride_y));

        acc += b_top * (float4)a_pair.s0;
        acc += b_next * (float4)a_pair.s1;

        pos += (int2)(2 * sizeof(float), 2 * src1_stride_y);
    }

    // Left-over loop: one element of A against one row of B per iteration
    while(pos.s0 < a_row_end)
    {
        float  a_val = *((__global float *)(src0_ptr + pos.s0));
        float4 b_top = vload4(0, (__global float *)(src1_ptr + pos.s1));

        acc += b_top * (float4)a_val;

        pos += (int2)(sizeof(float), src1_stride_y);
    }

    // Write the four accumulated values to the destination
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
}

Anton Lokhmotov