Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2019-07-19 09:54:47 +0100

[diff] [blame]

49

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

50

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

51

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

52

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

53

* @note Only the following values for M0, K0 and V0 are supported:

54

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

55

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

56

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

57

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

58

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

59

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

60

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

61

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

62

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

63

*

64

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

65

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

66

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

67

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

68

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

69

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

70

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

71

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

72

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

73

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

74

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

75

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

76

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

77

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

78

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

79

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

80

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

81

*/

82

__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                         ,
                                         uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                        )
{
    // Number of elements in one M0xK0 block
#define BLOCK_SIZE ((M0) * (K0))

    // OUTPUT_OFFSET_X: offset (in elements) between the starts of two consecutive blocks placed on the same output row
    // OUTPUT_STEP_X  : stride (in elements) handed to STORE_BLOCK for consecutive rows of one block
    // Both expansions are fully parenthesized so that they stay correct inside any surrounding expression.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#define OUTPUT_STEP_X ((K0) * (V0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates: x selects the block along K, y the block along M, z the batch
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: top-left element of the M0xK0 block (x, y)
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 consecutive vertical blocks share one output row.
    // (y / V0) selects the output row, (y % V0) the position of the block inside that row.
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes
                                 + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE))
                                 + ((y / (uint)V0) * (uint)dst_stride_y)
                                 + ((y % (uint)V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------
    // Load values from the LHS matrix into a0 ... a(M0-1)
    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

    // NOTE(review): BOUNDARY_CONDITION_X is defined outside this view; presumably it handles the
    // elements of each row that fall beyond SRC_WIDTH - confirm against its definition.
    BOUNDARY_CONDITION_X(x, a0);
#if M0 > 1
    BOUNDARY_CONDITION_X(x, a1);
#endif // M0 > 1
#if M0 > 2
    BOUNDARY_CONDITION_X(x, a2);
#endif // M0 > 2
#if M0 > 3
    BOUNDARY_CONDITION_X(x, a3);
#endif // M0 > 3
#if M0 > 4
    BOUNDARY_CONDITION_X(x, a4);
#endif // M0 > 4
#if M0 > 5
    BOUNDARY_CONDITION_X(x, a5);
#endif // M0 > 5
#if M0 > 6
    BOUNDARY_CONDITION_X(x, a6);
#endif // M0 > 6
#if M0 > 7
    BOUNDARY_CONDITION_X(x, a7);
#endif // M0 > 7

    // ---------------------------Store output values ------------------------------
    // Create variables: uint zout0=0, zout1=0, ... zoutF=0; (no extra Z offset on the output)
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

175

176

/* TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)
 *
 * Gathers component i of the row vectors a0 ... a(M0-1) into a single M0-element column and stores it at
 * byte address output_ptr + i * output_step_x * sizeof(DATA_TYPE).
 *
 * - output_ptr    : byte pointer to the start of the destination block
 * - output_step_x : distance (in elements) between two consecutive stored columns
 * - i             : a single hexadecimal digit (0-9, A-F). It is token-pasted both into the vector
 *                   component selector (.s<i>) and into the literal 0x<i>, so it must not be an expression.
 *
 * For M0 = 5, 6 and 7 the column is written as a 4-element vector followed by the remaining
 * 1, 2 or 3 elements. Macro arguments are parenthesized so that expression arguments expand safely.
 */
#if M0 == 2
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 3 // M0 == 3
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 4 // M0 == 4
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 5 // M0 == 5
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        DATA_TYPE res1 = a4.s##i; \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        *((__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4) = res1; \
    })
#elif M0 == 6 // M0 == 6
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VEC_DATA_TYPE(DATA_TYPE, 2) \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        VSTORE(2) \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 7 // M0 == 7
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VEC_DATA_TYPE(DATA_TYPE, 3) \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        VSTORE(3) \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 8 // M0 == 8
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#else // M0 not supported
#error "M0 value not supported"
#endif // M0 conditions

245

246

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
 *  the output matrix unrolling the values.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
 * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
 * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note Only the following values for M0, K0 and V0 are supported:
 *                                      M0: 2,3,4,5,6,7,8
 *                                      K0: 2,3,4,8,16
 *                                      V0: greater than 0
 * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
 * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 *
 * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
 * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source LHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 */
__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
                                        TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                        ,
                                        uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                       )
{
    // Number of elements in one M0xK0 block
#define BLOCK_SIZE ((M0) * (K0))

    // OUTPUT_OFFSET_X: offset (in elements) between the starts of two consecutive blocks placed on the same output row
    // OUTPUT_STEP_X  : distance (in elements) between two consecutive transposed columns of one block
    // Both expansions are fully parenthesized so that they stay correct inside any surrounding expression.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (M0)
#define OUTPUT_STEP_X ((M0) * (V0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (M0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates: x selects the block along K, y the block along M, z the batch
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: top-left element of the M0xK0 block (x, y)
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 consecutive vertical blocks share one output row.
    // (y / V0) selects the output row, (y % V0) the position of the block inside that row.
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes
                                 + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE))
                                 + ((y / (uint)V0) * (uint)dst_stride_y)
                                 + ((y % (uint)V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Load values from the LHS matrix into a0 ... a(M0-1)
    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

    // NOTE(review): BOUNDARY_CONDITION_X is defined outside this view; presumably it handles the
    // elements of each row that fall beyond SRC_WIDTH - confirm against its definition.
    BOUNDARY_CONDITION_X(x, a0);
#if M0 > 1
    BOUNDARY_CONDITION_X(x, a1);
#endif // M0 > 1
#if M0 > 2
    BOUNDARY_CONDITION_X(x, a2);
#endif // M0 > 2
#if M0 > 3
    BOUNDARY_CONDITION_X(x, a3);
#endif // M0 > 3
#if M0 > 4
    BOUNDARY_CONDITION_X(x, a4);
#endif // M0 > 4
#if M0 > 5
    BOUNDARY_CONDITION_X(x, a5);
#endif // M0 > 5
#if M0 > 6
    BOUNDARY_CONDITION_X(x, a6);
#endif // M0 > 6
#if M0 > 7
    BOUNDARY_CONDITION_X(x, a7);
#endif // M0 > 7

    // ---------------------------Transpose and store block -----------------------

    // One call per column of the block (K0 columns in total); the last argument is the column index
    // written as a hexadecimal digit because the macro pastes it into ".s<i>" and "0x<i>".
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
#if K0 > 2
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
#endif // K0 > 2
#if K0 > 3
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
#endif // K0 > 3
#if K0 > 4
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
#endif // K0 > 4
#if K0 > 8
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
#endif // K0 > 8

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

399

#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

400

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

401

#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

402

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
 *  the output matrix unrolling the values.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
 * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
 * @note The number of K0xN0 horizontal blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
 * @note Only the following values for K0, N0 and H0 are supported:
 *                                      N0: 2,3,4,8,16
 *                                      K0: 1,2,3,4,8,16
 *                                      H0: greater than 0
 *
 * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
 * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source RHS tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
 * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst))
{
    // Number of elements in one K0xN0 block
#define BLOCK_SIZE ((K0) * (N0))

    // OUTPUT_OFFSET_X: offset (in elements) between the starts of two consecutive blocks placed on the same output row
    // OUTPUT_STEP_X  : stride (in elements) handed to STORE_BLOCK for consecutive rows of one block
    // Both expansions are fully parenthesized so that they stay correct inside any surrounding expression.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (N0)
#define OUTPUT_STEP_X ((N0) * (H0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (N0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates: x selects the block along N, y the block along K, z the batch
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: top-left element of the K0xN0 block (x, y) in batch z
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

    // Compute the output address: H0 blocks that are consecutive along X share one output row.
    // (x / H0) selects the output row, (x % H0) the position of the block inside that row.
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes
                                 + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE))
                                 + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE))
                                 + ((x / (uint)H0) * (uint)dst_stride_y)
                                 + z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Create variables: VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0);

    // Load values from the RHS matrix.
    // Row 0 is always loaded; every further row is loaded only if it lies inside the source
    // (row index < SRC_HEIGHT), otherwise it keeps the zero it was initialized with.
    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
#if K0 > 1
    if(y * (uint)K0 + 1 < SRC_HEIGHT)
    {
        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
    }
#endif // K0 > 1
#if K0 > 2
    if(y * (uint)K0 + 2 < SRC_HEIGHT)
    {
        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
    }
#endif // K0 > 2
#if K0 > 3
    if(y * (uint)K0 + 3 < SRC_HEIGHT)
    {
        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
    }
#endif // K0 > 3
#if K0 > 4
    if(y * (uint)K0 + 4 < SRC_HEIGHT)
    {
        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
    }
    if(y * (uint)K0 + 5 < SRC_HEIGHT)
    {
        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
    }
    if(y * (uint)K0 + 6 < SRC_HEIGHT)
    {
        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
    }
    if(y * (uint)K0 + 7 < SRC_HEIGHT)
    {
        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
    }
#endif // K0 > 4
#if K0 > 8
    if(y * (uint)K0 + 8 < SRC_HEIGHT)
    {
        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
    }
    if(y * (uint)K0 + 9 < SRC_HEIGHT)
    {
        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
    }
    if(y * (uint)K0 + 10 < SRC_HEIGHT)
    {
        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
    }
    if(y * (uint)K0 + 11 < SRC_HEIGHT)
    {
        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
    }
    if(y * (uint)K0 + 12 < SRC_HEIGHT)
    {
        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
    }
    if(y * (uint)K0 + 13 < SRC_HEIGHT)
    {
        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
    }
    if(y * (uint)K0 + 14 < SRC_HEIGHT)
    {
        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
    }
    if(y * (uint)K0 + 15 < SRC_HEIGHT)
    {
        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
    }
#endif // K0 > 8

    // ---------------------------Store output values ------------------------------
    // Create variables: uint zout0=0, zout1=0, ... zoutF=0; (no extra Z offset on the output)
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

#if defined(TRANSPOSE)

555

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in

556

* the output matrix unrolling the values.

557

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

558

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

559

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

560

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

561

* @note The number of K0xN0 horizontal blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

562

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

563

* @note The option -DTRANSPOSE must be passed at compile time.

564

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

565

* N0: 2,3,4,8,16

566

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

567

* H0: greater than 0

568

*

569

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

570

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

571

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

572

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

573

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

574

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

575

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

576

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

577

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

578

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

579

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

580

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

581

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

582

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

583

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

584

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

585

*/

586

__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
                                        TENSOR3D_DECLARATION(dst))
{
    // Overview of what this kernel body does:
    //  1. loads up to K0 rows of N0 elements each from the RHS tensor (rows at or beyond SRC_HEIGHT
    //     are not read and keep their zero initialisation; row 0 is always read),
    //  2. transposes the K0xN0 block into N0 vectors of K0 elements,
    //  3. writes the N0 vectors through STORE_BLOCK using OUTPUT_STEP_X elements as the row stride.

    // Block size (number of elements in one K0xN0 block)
#define BLOCK_SIZE ((K0) * (N0))

    // Output offset X: distance (in elements) between the blocks written by consecutive work-items
    // that share the same output row
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif // defined(INTERLEAVE)

    // Output step X: distance (in elements) between two consecutive transposed rows of the same block.
    // The whole expansion is parenthesised so the macro can be used safely inside larger expressions.
#if defined(INTERLEAVE)
#define OUTPUT_STEP_X ((K0) * (H0))
#else // Do not interleave
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: x selects a group of N0 columns, y a group of K0 rows
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

    // Compute the output address: H0 blocks share one output row, so (x % H0) selects the slot
    // inside the row and (x / H0) selects the output row
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) +
                                 ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------
    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    a0=0, a1=0, ... a(K0-1)=0;

    // Load values from the RHS matrix. Every row after the first one is guarded against SRC_HEIGHT.
    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
    if(y * (uint)K0 + 1 < SRC_HEIGHT)
    {
        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
    }
#if K0 > 2
    if(y * (uint)K0 + 2 < SRC_HEIGHT)
    {
        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
    }
#endif // K0 > 2
#if K0 > 3
    if(y * (uint)K0 + 3 < SRC_HEIGHT)
    {
        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
    }
#endif // K0 > 3
#if K0 > 4
    if(y * (uint)K0 + 4 < SRC_HEIGHT)
    {
        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
    }
    if(y * (uint)K0 + 5 < SRC_HEIGHT)
    {
        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
    }
    if(y * (uint)K0 + 6 < SRC_HEIGHT)
    {
        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
    }
    if(y * (uint)K0 + 7 < SRC_HEIGHT)
    {
        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
    }
#endif // K0 > 4
#if K0 > 8
    if(y * (uint)K0 + 8 < SRC_HEIGHT)
    {
        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
    }
    if(y * (uint)K0 + 9 < SRC_HEIGHT)
    {
        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
    }
    if(y * (uint)K0 + 10 < SRC_HEIGHT)
    {
        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
    }
    if(y * (uint)K0 + 11 < SRC_HEIGHT)
    {
        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
    }
    if(y * (uint)K0 + 12 < SRC_HEIGHT)
    {
        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
    }
    if(y * (uint)K0 + 13 < SRC_HEIGHT)
    {
        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
    }
    if(y * (uint)K0 + 14 < SRC_HEIGHT)
    {
        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
    }
    if(y * (uint)K0 + 15 < SRC_HEIGHT)
    {
        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
    }
#endif // K0 > 8

    // ---------------------------Transpose the block ------------------------------
    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0)    res0=0, res1=0, res2=0,... res(N0-1)=0;

#if K0 == 2
    // This part computes the following transpositions:
    // 2x2 -> 2x2
    // 2x4 -> 4x2
    // 2x8 -> 8x2
    // 2x16 -> 16x2
    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
#if N0 > 2
    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
#endif // N0 > 2
#if N0 > 3
    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
#endif // N0 > 3
#if N0 > 4
    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
#endif // N0 > 4
#if N0 > 8
    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
#endif // N0 > 8

#elif K0 == 3 // K0 == 3
    // This part computes the following transpositions:
    // 3x2 -> 2x3
    // 3x4 -> 4x3
    // 3x8 -> 8x3
    // 3x16 -> 16x3
    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
#if N0 > 2
    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
#endif // N0 > 2
#if N0 > 3
    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
#endif // N0 > 3
#if N0 > 4
    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
#endif // N0 > 4
#if N0 > 8
    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
#endif // N0 > 8

#elif K0 == 4 // K0 == 4
    // This part computes the following transpositions:
    // 4x2 -> 2x4
    // 4x4 -> 4x4
    // 4x8 -> 8x4
    // 4x16 -> 16x4
    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
#if N0 > 2
    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
#endif // N0 > 2
#if N0 > 3
    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
#endif // N0 > 3
#if N0 > 4
    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
#endif // N0 > 4
#if N0 > 8
    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
#endif // N0 > 8

#elif K0 == 8 // K0 == 8
    // This part computes the following transpositions:
    // 8x2 -> 2x8
    // 8x4 -> 4x8
    // 8x8 -> 8x8
    // 8x16 -> 16x8
    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
#if N0 > 2
    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
#endif // N0 > 2
#if N0 > 3
    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
#endif // N0 > 3
#if N0 > 4
    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
#endif // N0 > 4
#if N0 > 8
    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
#endif // N0 > 8

#elif K0 == 16 // K0 == 16
    // This part computes the following transpositions:
    // 16x2 -> 2x16
    // 16x4 -> 4x16
    // 16x8 -> 8x16
    // 16x16 -> 16x16
    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
                                          a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
                                          a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
#if N0 > 2
    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
                                          a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
#endif // N0 > 2
#if N0 > 3
    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
                                          a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
#endif // N0 > 3
#if N0 > 4
    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
                                          a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
                                          a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
                                          a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
                                          a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
#endif // N0 > 4
#if N0 > 8
    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
                                          a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
                                          a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
                                          a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
                                          a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
                                          a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
                                          a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
                                          a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
                                          a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
#endif // N0 > 8

#else // K0 not supported
    // This chain dispatches on K0, so the diagnostic must name K0 (not N0)
#error "Not supported K0 value"
#endif // K0 conditions

    // ---------------------------Store the output values ------------------------------
    // zout0..zoutF are all zero and are passed as the last STORE_BLOCK argument
    // (presumably per-row extra offsets - confirm against the STORE_BLOCK definition)
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

881

#endif // defined(TRANSPOSE)

882

#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

883

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

884

#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

885

886

/* Token-pasting helper. It is invoked through another macro's parameter so that the
 * arguments are macro-expanded before being pasted (e.g. CONCAT(ARM_DOT, k0) with k0 = K0).
 */
#define CONCAT(a, b) a##b

/* ARM_DOTn(a, b, c): accumulate into c the dot product of the first n components of a and b,
 * one fma() per component, in ascending component order.
 *  - ARM_DOT1 takes scalar a and b.
 *  - ARM_DOT2/3/4 read components .s0 ... .s3 of a and b.
 *  - ARM_DOT8/16 split a and b into their .lo / .hi halves and reuse the smaller macros.
 * c must be a modifiable lvalue; it is evaluated several times.
 * Vector arguments are parenthesised before the component selector is applied, so any
 * expression (not only a plain identifier) can be passed for a and b.
 * The bodies are GNU statement expressions, as in the rest of this file.
 */
#define ARM_DOT1(a, b, c) \
    ({                    \
        c = fma(a, b, c); \
    })
#define ARM_DOT2(a, b, c)           \
    ({                              \
        c = fma((a).s0, (b).s0, c); \
        c = fma((a).s1, (b).s1, c); \
    })
#define ARM_DOT3(a, b, c)           \
    ({                              \
        ARM_DOT2(a, b, c);          \
        c = fma((a).s2, (b).s2, c); \
    })
#define ARM_DOT4(a, b, c)           \
    ({                              \
        ARM_DOT3(a, b, c);          \
        c = fma((a).s3, (b).s3, c); \
    })
#define ARM_DOT8(a, b, c)                \
    ({                                   \
        ARM_DOT4(((a).lo), ((b).lo), c); \
        ARM_DOT4(((a).hi), ((b).hi), c); \
    })
#define ARM_DOT16(a, b, c)               \
    ({                                   \
        ARM_DOT8(((a).lo), ((b).lo), c); \
        ARM_DOT8(((a).hi), ((b).hi), c); \
    })

/* ARM_DOT_K0XN0(k0, a, b, c): for every n in [0, N0) accumulate into component n of c the
 * k0-element dot product between a and the variable named b<n>, i.e. c.s<n> += dot(a, b<n>).
 *  - k0 selects the ARM_DOT<k0> macro through CONCAT.
 *  - b is an identifier prefix: the macro pastes the hexadecimal suffix 0..F onto it
 *    (b##0 ... b##F), so b0 ... b(N0-1) must be visible at the point of use.
 *  - c is the accumulator vector; its components .s0 ... .s(N0-1) are updated in place.
 * One definition is provided per supported N0 (2, 3, 4, 8, 16); any other value is rejected
 * at compile time.
 */
#if N0 == 2
#define ARM_DOT_K0XN0(k0, a, b, c) \
    ({                             \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##0), (c.s0));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##1), (c.s1));     \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(k0, a, b, c) \
    ({                             \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##0), (c.s0));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##1), (c.s1));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##2), (c.s2));     \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(k0, a, b, c) \
    ({                             \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##0), (c.s0));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##1), (c.s1));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##2), (c.s2));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##3), (c.s3));     \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(k0, a, b, c) \
    ({                             \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##0), (c.s0));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##1), (c.s1));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##2), (c.s2));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##3), (c.s3));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##4), (c.s4));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##5), (c.s5));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##6), (c.s6));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##7), (c.s7));     \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(k0, a, b, c) \
    ({                             \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##0), (c.s0));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##1), (c.s1));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##2), (c.s2));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##3), (c.s3));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##4), (c.s4));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##5), (c.s5));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##6), (c.s6));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##7), (c.s7));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##8), (c.s8));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##9), (c.s9));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##A), (c.sA));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##B), (c.sB));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##C), (c.sC));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##D), (c.sD));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##E), (c.sE));     \
        CONCAT(ARM_DOT, k0)        \
        ((a), (b##F), (c.sF));     \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

1007

1008

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1009

* The LHS matrix is NOT reshaped

1010

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1011

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1012

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1013

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1014

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

1015

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1016

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1017

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1018

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

1019

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1020

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1021

* - N0 = 2, 3, 4, 8, 16

1022

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1023

* - H0 >= 1

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1024

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1025

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1026

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1027

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1028

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1029

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1030

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1031

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1032

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1033

*

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1034

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

1035

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

1036

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

1037

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

1038

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

1039

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

1040

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1041

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1042

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1043

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1044

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1045

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1046

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1047

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1048

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1049

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1050

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1051

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1052

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1053

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1054

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1055

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1056

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1057

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1058

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

1059

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1060

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1061

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1062

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1063

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1064

*/

1065

__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),

1066

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1067

#if defined(BETA)

1068

IMAGE_DECLARATION(bias),

1069

#endif // defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1070

IMAGE_DECLARATION(dst),

1071

uint lhs_stride_z,

1072

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1073

#if defined(BETA)

1074

uint bias_stride_z,

1075

#endif //defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1076

uint dst_stride_z

1077

#if defined(REINTERPRET_INPUT_AS_3D)

1078

,

1079

uint lhs_cross_plane_pad

1080

#endif // REINTERPRET_INPUT_AS_3D

1081

#if defined(REINTERPRET_OUTPUT_AS_3D)

1082

,

1083

uint dst_cross_plane_pad

1084

#endif // REINTERPRET_OUTPUT_AS_3D

1085

)

1086

{

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1087

// Block size

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1088

#define RHS_BLOCK_SIZE ((K0) * (N0))

1089

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1090

// RHS offset and step X

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1091

#if defined(RHS_INTERLEAVE)

1092

#define RHS_OFFSET_X (K0)

1093

#define RHS_STEP_X ((K0) * (H0))

1094

#define RHS_STEP_LOOP (1)

1095

#else // defined(RHS_INTERLEAVE)

1096

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1097

#define RHS_STEP_X (K0)

1098

#define RHS_STEP_LOOP (H0)

1099

#endif // defined(RHS_INTERLEAVE)

1100

1101

uint x = get_global_id(0);

1102

uint y = get_global_id(1);

1103

uint z = get_global_id(2);

1104

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1105

#if defined(DUMMY_WORK_ITEMS)

1106

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1111

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1112

// Compute LHS matrix address

1113

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1114

1115

// Compute RHS matrix address

1116

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1117

1118

#if defined(MATRIX_B_DEPTH)

1119

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1120

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1121

#else // defined(MATRIX_B_DEPTH)

1122

rhs_offset += z * rhs_stride_z;

1123

#endif // defined(MATRIX_B_DEPTH)

1124

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1125

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1126

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1127

1128

#if defined(REINTERPRET_INPUT_AS_3D)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1129

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1130

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1131

1132

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1133

// multiply lhs_stride_z by DEPTH_GEMM3D

1134

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1135

1136

#else // defined(REINTERPRET_INPUT_AS_3D)

1137

1138

// Add offset for batched GEMM

1139

lhs_offset += z * lhs_stride_z;

1140

1141

#endif // defined(REINTERPRET_INPUT_AS_3D)

1142

1143

// Initialize the accumulators

1144

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1145

1146

int i = 0;

1147

for(; i <= (K - K0); i += K0)

1148

{

1149

// Supported cases (M0, K0):

1150

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1151

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1152

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1153

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1154

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1155

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1156

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1157

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1158

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1159

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1160

1161

// Load values from RHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1162

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1163

1164

// Accumulate

1165

ARM_DOT_K0XN0(K0, a0, b, c0);

1166

#if M0 > 1

1167

ARM_DOT_K0XN0(K0, a1, b, c1);

1168

#endif // M0 > 1

1169

#if M0 > 2

1170

ARM_DOT_K0XN0(K0, a2, b, c2);

1171

#endif // M0 > 2

1172

#if M0 > 3

1173

ARM_DOT_K0XN0(K0, a3, b, c3);

1174

#endif // M0 > 3

1175

#if M0 > 4

1176

ARM_DOT_K0XN0(K0, a4, b, c4);

1177

#endif // M0 > 4

1178

#if M0 > 5

1179

ARM_DOT_K0XN0(K0, a5, b, c5);

1180

#endif // M0 > 5

1181

#if M0 > 6

1182

ARM_DOT_K0XN0(K0, a6, b, c6);

1183

#endif // M0 > 6

1184

#if M0 > 7

1185

ARM_DOT_K0XN0(K0, a7, b, c7);

1186

#endif // M0 > 7

1187

1188

lhs_offset += K0 * sizeof(DATA_TYPE);

1189

rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

1190

}

1191

1192

// Left-over accumulations

1193

for(; i < K; ++i)

1194

{

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1195

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1196

LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1197

1198

// Load values from RHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1199

LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1200

1201

// Accumulate

1202

ARM_DOT_K0XN0(1, a0, b, c0);

1203

#if M0 > 1

1204

ARM_DOT_K0XN0(1, a1, b, c1);

1205

#endif // M0 > 1

1206

#if M0 > 2

1207

ARM_DOT_K0XN0(1, a2, b, c2);

1208

#endif // M0 > 2

1209

#if M0 > 3

1210

ARM_DOT_K0XN0(1, a3, b, c3);

1211

#endif // M0 > 3

1212

#if M0 > 4

1213

ARM_DOT_K0XN0(1, a4, b, c4);

1214

#endif // M0 > 4

1215

#if M0 > 5

1216

ARM_DOT_K0XN0(1, a5, b, c5);

1217

#endif // M0 > 5

1218

#if M0 > 6

1219

ARM_DOT_K0XN0(1, a6, b, c6);

1220

#endif // M0 > 6

1221

#if M0 > 7

1222

ARM_DOT_K0XN0(1, a7, b, c7);

1223

#endif // M0 > 7

1224

1225

lhs_offset += sizeof(DATA_TYPE);

1226

rhs_offset += sizeof(DATA_TYPE);

1227

}

1228

1229

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

1230

1231

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1232

1233

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1234

1235

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1236

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1237

1238

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1239

// multiply dst_stride_z by DEPTH_GEMM3D

1240

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1241

1242

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1243

1244

// Add offset for batched GEMM

1245

dst_addr += z * dst_stride_z;

1246

1247

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1248

1249

// Multiply by the weight of matrix-matrix product and store the result

1250

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1251

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1252

#endif // defined(ALPHA)

1253

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1254

// Add beta*bias

1255

#if defined(BETA)

1256

#if defined(BROADCAST_BIAS)

1257

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1258

1259

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1260

1261

#ifndef UNIT_BETA

1262

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1263

#endif // UNIT_BETA

1264

1265

// c = c + bias[broadcasted]

1266

ADD_BLOCK_BROADCAST(M0, c, bias0);

1267

1268

#else // defined(BROADCAST_BIAS)

1269

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

1270

2) * bias_stride_z;

1271

1272

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1273

1274

#ifndef UNIT_BETA

1275

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

1280

1281

#endif // defined(BROADCAST_BIAS)

1282

#endif // defined(BETA)

1283

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1284

#if defined(ACTIVATION_TYPE)

1285

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

1286

#endif // defined(ACTIVATION_TYPE)

1287

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1288

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1289

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1290

1291

#undef RHS_BLOCK_SIZE

1292

#undef RHS_OFFSET_X

1293

#undef RHS_STEP_X

1294

}

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1295

1296

/* Fused multiply-add accumulate: stores fma(a, b, c) back into c.
 * c must therefore be a modifiable lvalue; it is evaluated twice. */
#define VFMA(a, b, c)             \
    ({                            \
        (c) = fma((a), (b), (c)); \
    })

/* Row-wise accumulation helpers used by LD_RHS_VFMA_M0xN0.
 *
 * LD_RHS_VFMA_ROWS_<n>(lane, a, c, rhs) performs, for row = 0 .. n-1 (in that order):
 *     c<row> = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a<row>.<lane>), rhs, c<row>)
 * Each helper reuses the previous one and appends the update for one more row.
 */
#define LD_RHS_VFMA_ROWS_1(lane, a, c, rhs) \
    VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).lane), rhs, (c##0))
#define LD_RHS_VFMA_ROWS_2(lane, a, c, rhs) \
    LD_RHS_VFMA_ROWS_1(lane, a, c, rhs);    \
    VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).lane), rhs, (c##1))
#define LD_RHS_VFMA_ROWS_3(lane, a, c, rhs) \
    LD_RHS_VFMA_ROWS_2(lane, a, c, rhs);    \
    VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).lane), rhs, (c##2))
#define LD_RHS_VFMA_ROWS_4(lane, a, c, rhs) \
    LD_RHS_VFMA_ROWS_3(lane, a, c, rhs);    \
    VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).lane), rhs, (c##3))
#define LD_RHS_VFMA_ROWS_5(lane, a, c, rhs) \
    LD_RHS_VFMA_ROWS_4(lane, a, c, rhs);    \
    VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).lane), rhs, (c##4))
#define LD_RHS_VFMA_ROWS_6(lane, a, c, rhs) \
    LD_RHS_VFMA_ROWS_5(lane, a, c, rhs);    \
    VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).lane), rhs, (c##5))
#define LD_RHS_VFMA_ROWS_7(lane, a, c, rhs) \
    LD_RHS_VFMA_ROWS_6(lane, a, c, rhs);    \
    VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).lane), rhs, (c##6))
#define LD_RHS_VFMA_ROWS_8(lane, a, c, rhs) \
    LD_RHS_VFMA_ROWS_7(lane, a, c, rhs);    \
    VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).lane), rhs, (c##7))

/* Pick the helper matching the number of LHS rows (M0) handled per work-item */
#if M0 == 1
#define LD_RHS_VFMA_ROWS(lane, a, c, rhs) LD_RHS_VFMA_ROWS_1(lane, a, c, rhs)
#elif M0 == 2 // M0 == 2
#define LD_RHS_VFMA_ROWS(lane, a, c, rhs) LD_RHS_VFMA_ROWS_2(lane, a, c, rhs)
#elif M0 == 3 // M0 == 3
#define LD_RHS_VFMA_ROWS(lane, a, c, rhs) LD_RHS_VFMA_ROWS_3(lane, a, c, rhs)
#elif M0 == 4 // M0 == 4
#define LD_RHS_VFMA_ROWS(lane, a, c, rhs) LD_RHS_VFMA_ROWS_4(lane, a, c, rhs)
#elif M0 == 5 // M0 == 5
#define LD_RHS_VFMA_ROWS(lane, a, c, rhs) LD_RHS_VFMA_ROWS_5(lane, a, c, rhs)
#elif M0 == 6 // M0 == 6
#define LD_RHS_VFMA_ROWS(lane, a, c, rhs) LD_RHS_VFMA_ROWS_6(lane, a, c, rhs)
#elif M0 == 7 // M0 == 7
#define LD_RHS_VFMA_ROWS(lane, a, c, rhs) LD_RHS_VFMA_ROWS_7(lane, a, c, rhs)
#elif M0 == 8 // M0 == 8
#define LD_RHS_VFMA_ROWS(lane, a, c, rhs) LD_RHS_VFMA_ROWS_8(lane, a, c, rhs)
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

/* Loads one N0-wide RHS vector and accumulates it into the M0 accumulators.
 *
 * i : single hexadecimal digit (0-9, A-F). It is pasted into the RHS row index (0x<i>)
 *     and into the LHS vector component selector (.s<i>).
 * a : basename of the LHS vectors (a0, a1, ...)
 * c : basename of the accumulators (c0, c1, ...)
 *
 * Relies on rhs_ptr, rhs_offset and RHS_STEP_X being visible at the point of use.
 */
#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                                         \
    ({                                                                                                                     \
        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                                       \
        rhs_row = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE)));     \
        LD_RHS_VFMA_ROWS(s##i, a, c, rhs_row);                                                                             \
    })

1388

1389

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1390

* The LHS matrix is NOT reshaped

1391

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

1392

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1393

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1394

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90).

1395

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1396

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1397

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1398

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.

1399

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1400

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1401

* - N0 = 2, 3, 4, 8, 16

1402

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1403

* - H0 >= 1

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1404

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1405

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1406

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1407

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1408

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1409

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1410

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1411

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1412

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1413

*

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1414

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

1415

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

1416

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1417

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

1418

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1419

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

1420

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1421

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1422

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1423

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1424

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1425

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1426

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1427

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1428

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1429

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1430

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1431

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1432

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1433

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1434

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1435

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1436

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1437

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1438

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

1439

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1440

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1441

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1442

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1443

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1444

*/

1445

__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),

1446

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1447

#if defined(BETA)

1448

IMAGE_DECLARATION(bias),

1449

#endif // defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1450

IMAGE_DECLARATION(dst),

1451

uint lhs_stride_z,

1452

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1453

#if defined(BETA)

1454

uint bias_stride_z,

1455

#endif //defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1456

uint dst_stride_z

1457

#if defined(REINTERPRET_INPUT_AS_3D)

1458

,

1459

uint lhs_cross_plane_pad

1460

#endif // REINTERPRET_INPUT_AS_3D

1461

#if defined(REINTERPRET_OUTPUT_AS_3D)

1462

,

1463

uint dst_cross_plane_pad

1464

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

1469

1470

// RHS offset and step X

1471

#if defined(RHS_INTERLEAVE)

1472

#define RHS_OFFSET_X (N0)

1473

#define RHS_STEP_X ((N0) * (H0))

1474

#define RHS_STEP_LOOP (1)

1475

#else // defined(RHS_INTERLEAVE)

1476

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1477

#define RHS_STEP_X (N0)

1478

#define RHS_STEP_LOOP (H0)

1479

#endif // defined(RHS_INTERLEAVE)

1480

1481

uint x = get_global_id(0);

1482

uint y = get_global_id(1);

1483

uint z = get_global_id(2);

1484

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1485

#if defined(DUMMY_WORK_ITEMS)

1486

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1491

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1492

// Compute LHS matrix address

1493

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1494

1495

// Compute RHS matrix address

1496

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1497

1498

#if defined(MATRIX_B_DEPTH)

1499

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1500

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1501

#else // defined(MATRIX_B_DEPTH)

1502

rhs_offset += z * rhs_stride_z;

1503

#endif // defined(MATRIX_B_DEPTH)

1504

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1505

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;

1506

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1507

1508

#if defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1509

1510

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1511

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1512

1513

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1514

// multiply lhs_stride_z by DEPTH_GEMM3D

1515

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1516

1517

#else // defined(REINTERPRET_INPUT_AS_3D)

1518

1519

// Add offset for batched GEMM

1520

lhs_offset += z * lhs_stride_z;

1521

1522

#endif // defined(REINTERPRET_INPUT_AS_3D)

1523

1524

// Initialize the accumulators

1525

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;

1526

1527

int i = 0;

1528

for(; i <= (K - K0); i += K0)

1529

{

1530

// Supported cases (M0, K0):

1531

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1532

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1533

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1534

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1535

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1536

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1537

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1538

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1539

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1540

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1541

1542

LD_RHS_VFMA_M0xN0(0, a, c);

1543

LD_RHS_VFMA_M0xN0(1, a, c);

1544

#if K0 > 2

1545

LD_RHS_VFMA_M0xN0(2, a, c);

1546

#endif // K0 > 2

1547

#if K0 > 3

1548

LD_RHS_VFMA_M0xN0(3, a, c);

1549

#endif // K0 > 3

1550

#if K0 > 4

1551

LD_RHS_VFMA_M0xN0(4, a, c);

1552

LD_RHS_VFMA_M0xN0(5, a, c);

1553

LD_RHS_VFMA_M0xN0(6, a, c);

1554

LD_RHS_VFMA_M0xN0(7, a, c);

1555

#endif // K0 > 4

1556

#if K0 > 8

1557

LD_RHS_VFMA_M0xN0(8, a, c);

1558

LD_RHS_VFMA_M0xN0(9, a, c);

1559

LD_RHS_VFMA_M0xN0(A, a, c);

1560

LD_RHS_VFMA_M0xN0(B, a, c);

1561

LD_RHS_VFMA_M0xN0(C, a, c);

1562

LD_RHS_VFMA_M0xN0(D, a, c);

1563

LD_RHS_VFMA_M0xN0(E, a, c);

1564

LD_RHS_VFMA_M0xN0(F, a, c);

1565

#endif // K0 > 8

1566

1567

lhs_offset += K0 * sizeof(DATA_TYPE);

1568

rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);

1569

}

1570

1571

// Left-over accumulations

1572

for(; i < K; ++i)

1573

{

1574

// Load values from LHS matrix

1575

VEC_DATA_TYPE(DATA_TYPE, 2)

1576

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

1577

#if M0 > 1

1578

VEC_DATA_TYPE(DATA_TYPE, 2)

1579

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

1580

#endif // M0 > 1

1581

#if M0 > 2

1582

VEC_DATA_TYPE(DATA_TYPE, 2)

1583

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

1584

#endif // M0 > 2

1585

#if M0 > 3

1586

VEC_DATA_TYPE(DATA_TYPE, 2)

1587

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

1588

#endif // M0 > 3

1589

#if M0 > 4

1590

VEC_DATA_TYPE(DATA_TYPE, 2)

1591

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

1592

#endif // M0 > 4

1593

#if M0 > 5

1594

VEC_DATA_TYPE(DATA_TYPE, 2)

1595

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

1596

#endif // M0 > 5

1597

#if M0 > 6

1598

VEC_DATA_TYPE(DATA_TYPE, 2)

1599

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

1600

#endif // M0 > 6

1601

#if M0 > 7

1602

VEC_DATA_TYPE(DATA_TYPE, 2)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

1603

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1604

#endif // M0 > 7

1605

1606

LD_RHS_VFMA_M0xN0(0, a, c);

1607

1608

lhs_offset += sizeof(DATA_TYPE);

1609

rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);

1610

}

1611

1612

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

1613

1614

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1615

1616

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1617

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1618

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1619

1620

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1621

// multiply dst_stride_z by DEPTH_GEMM3D

1622

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1623

1624

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1625

1626

// Add offset for batched GEMM

1627

dst_addr += z * dst_stride_z;

1628

1629

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1630

1631

// Multiply by the weight of matrix-matrix product and store the result

1632

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1633

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1634

#endif // defined(ALPHA)

1635

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1636

// Add beta*bias

1637

#if defined(BETA)

1638

#if defined(BROADCAST_BIAS)

1639

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1640

1641

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1642

1643

#ifndef UNIT_BETA

1644

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1645

#endif // UNIT_BIAS

1646

1647

// c = c + bias[broadcasted]

1648

ADD_BLOCK_BROADCAST(M0, c, bias0);

1649

1650

#else // defined(BROADCAST_BIAS)

1651

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

1652

2) * bias_stride_z;

1653

1654

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1655

1656

#ifndef UNIT_BETA

1657

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

1662

1663

#endif // defined(BROADCAST_BIAS)

1664

#endif // defined(BETA)

1665

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1666

#if defined(ACTIVATION_TYPE)

1667

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

1668

#endif // defined(ACTIVATION_TYPE)

1669

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1670

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1671

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1672

1673

#undef RHS_BLOCK_SIZE

1674

#undef RHS_OFFSET_X

1675

#undef RHS_STEP_X

1676

}

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1677

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1678

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1679

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1680

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1681

/* Single multiply-accumulate step used to build ARM_DOT_K0: acc receives x * y.
 *  - with -DMIXED_PRECISION the step is written as a multiplication followed by "+="
 *  - otherwise the step is a single fma() call
 * NOTE(review): in the MIXED_PRECISION form the product x * y is evaluated in the type of x and y
 * before being added to acc - confirm this is the intended precision for the F16 path.
 */
#if defined(MIXED_PRECISION)
#define ARM_DOT_K0_ACCUMULATE(x, y, acc) acc += x * y
#else // defined(MIXED_PRECISION)
#define ARM_DOT_K0_ACCUMULATE(x, y, acc) acc = fma(x, y, acc)
#endif // defined(MIXED_PRECISION)

/* ARM_DOT_K0_TERMS_<k>(a, b, c): accumulates components s0 .. s(k-1) of a and b into c, in increasing
 * component order. Each variant extends the previous one with the additional components. */
#define ARM_DOT_K0_TERMS_2(a, b, c)       \
    ARM_DOT_K0_ACCUMULATE(a.s0, b.s0, c); \
    ARM_DOT_K0_ACCUMULATE(a.s1, b.s1, c)
#define ARM_DOT_K0_TERMS_3(a, b, c) \
    ARM_DOT_K0_TERMS_2(a, b, c);    \
    ARM_DOT_K0_ACCUMULATE(a.s2, b.s2, c)
#define ARM_DOT_K0_TERMS_4(a, b, c) \
    ARM_DOT_K0_TERMS_3(a, b, c);    \
    ARM_DOT_K0_ACCUMULATE(a.s3, b.s3, c)
#define ARM_DOT_K0_TERMS_8(a, b, c)       \
    ARM_DOT_K0_TERMS_4(a, b, c);          \
    ARM_DOT_K0_ACCUMULATE(a.s4, b.s4, c); \
    ARM_DOT_K0_ACCUMULATE(a.s5, b.s5, c); \
    ARM_DOT_K0_ACCUMULATE(a.s6, b.s6, c); \
    ARM_DOT_K0_ACCUMULATE(a.s7, b.s7, c)
#define ARM_DOT_K0_TERMS_16(a, b, c)      \
    ARM_DOT_K0_TERMS_8(a, b, c);          \
    ARM_DOT_K0_ACCUMULATE(a.s8, b.s8, c); \
    ARM_DOT_K0_ACCUMULATE(a.s9, b.s9, c); \
    ARM_DOT_K0_ACCUMULATE(a.sA, b.sA, c); \
    ARM_DOT_K0_ACCUMULATE(a.sB, b.sB, c); \
    ARM_DOT_K0_ACCUMULATE(a.sC, b.sC, c); \
    ARM_DOT_K0_ACCUMULATE(a.sD, b.sD, c); \
    ARM_DOT_K0_ACCUMULATE(a.sE, b.sE, c); \
    ARM_DOT_K0_ACCUMULATE(a.sF, b.sF, c)

/* ARM_DOT_K0(a, b, c): c += dot product of the K0-component vectors a and b */
#if K0 == 2
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_TERMS_2(a, b, c); \
    })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_TERMS_3(a, b, c); \
    })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_TERMS_4(a, b, c); \
    })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_TERMS_8(a, b, c); \
    })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c)           \
    ({                                \
        ARM_DOT_K0_TERMS_16(a, b, c); \
    })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1796

1797

/* ARM_DOT_K0XN0_COLS_<n>(a, b, c): for col = 0 .. n-1 (hexadecimal suffix, increasing order) performs
 *     ARM_DOT_K0((a), (b<col>), (c.s<col>))
 * i.e. the vector a is dotted with each of the vectors b0, b1, ... and the result is accumulated into the
 * matching component of c. Each variant extends the previous one with the additional columns. */
#define ARM_DOT_K0XN0_COLS_2(a, b, c) \
    ARM_DOT_K0((a), (b##0), (c.s0));  \
    ARM_DOT_K0((a), (b##1), (c.s1))
#define ARM_DOT_K0XN0_COLS_3(a, b, c) \
    ARM_DOT_K0XN0_COLS_2(a, b, c);    \
    ARM_DOT_K0((a), (b##2), (c.s2))
#define ARM_DOT_K0XN0_COLS_4(a, b, c) \
    ARM_DOT_K0XN0_COLS_3(a, b, c);    \
    ARM_DOT_K0((a), (b##3), (c.s3))
#define ARM_DOT_K0XN0_COLS_8(a, b, c) \
    ARM_DOT_K0XN0_COLS_4(a, b, c);    \
    ARM_DOT_K0((a), (b##4), (c.s4));  \
    ARM_DOT_K0((a), (b##5), (c.s5));  \
    ARM_DOT_K0((a), (b##6), (c.s6));  \
    ARM_DOT_K0((a), (b##7), (c.s7))
#define ARM_DOT_K0XN0_COLS_16(a, b, c) \
    ARM_DOT_K0XN0_COLS_8(a, b, c);     \
    ARM_DOT_K0((a), (b##8), (c.s8));   \
    ARM_DOT_K0((a), (b##9), (c.s9));   \
    ARM_DOT_K0((a), (b##A), (c.sA));   \
    ARM_DOT_K0((a), (b##B), (c.sB));   \
    ARM_DOT_K0((a), (b##C), (c.sC));   \
    ARM_DOT_K0((a), (b##D), (c.sD));   \
    ARM_DOT_K0((a), (b##E), (c.sE));   \
    ARM_DOT_K0((a), (b##F), (c.sF))

/* ARM_DOT_K0XN0(a, b, c): accumulates into the N0 components of c the dot products between a and b0..b(N0-1)
 *  a : vector of K0 values
 *  b : basename of the N0 vectors (b0, b1, ...)
 *  c : accumulator vector with N0 components
 */
#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COLS_2(a, b, c); \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COLS_3(a, b, c); \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COLS_4(a, b, c); \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COLS_8(a, b, c); \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(a, b, c)          \
    ({                                  \
        ARM_DOT_K0XN0_COLS_16(a, b, c); \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

1853

1854

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1855

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

1856

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

1857

*

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1858

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

1859

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

1860

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1861

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1862

* @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 and -DN=90).

1863

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

1864

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

1865

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1866

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

1867

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

1868

* @note Only the following configurations of M0, N0 and K0 are currently supported:

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

1869

* - M0 = 2, 3, 4, 5, 6, 7, 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

1870

* - N0 = 2, 3, 4, 8, 16

1871

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1872

* - V0 >= 1

1873

* - H0 >= 1

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1874

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1875

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1876

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1877

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1878

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1879

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1880

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1881

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

1882

*

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1883

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

1884

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

1885

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

1886

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

1887

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

1888

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

1889

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1890

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1891

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1892

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1893

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1894

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

1895

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1896

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1897

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1898

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1899

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1900

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1901

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

1902

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1903

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1904

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1905

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1906

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1907

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

1908

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

1909

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1910

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

1911

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1912

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1913

*/

1914

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

1915

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1916

#if defined(BETA)

1917

IMAGE_DECLARATION(bias),

1918

#endif // defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1919

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

1920

uint k,

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1921

uint lhs_stride_z,

1922

uint rhs_stride_z,

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1923

#if defined(BETA)

1924

uint bias_stride_z,

1925

#endif //defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1926

uint dst_stride_z

1927

#if defined(REINTERPRET_OUTPUT_AS_3D)

1928

,

1929

uint dst_cross_plane_pad

1930

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

1935

1936

#if defined(LHS_INTERLEAVE)

1937

#define LHS_OFFSET_X (K0)

1938

#define LHS_STEP_X ((K0) * (V0))

1939

#define LHS_STEP_LOOP (1)

1940

#else // defined(INTERLEAVE)

1941

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

1942

#define LHS_STEP_X (K0)

1943

#define LHS_STEP_LOOP (V0)

1944

#endif // defined(INTERLEAVE)

1945

1946

// Block size

1947

#define RHS_BLOCK_SIZE ((K0) * (N0))

1948

1949

// RHS offset and step X

1950

#if defined(RHS_INTERLEAVE)

1951

#define RHS_OFFSET_X (K0)

1952

#define RHS_STEP_X ((K0) * (H0))

1953

#define RHS_STEP_LOOP (1)

1954

#else // defined(RHS_INTERLEAVE)

1955

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1956

#define RHS_STEP_X (K0)

1957

#define RHS_STEP_LOOP (H0)

1958

#endif // defined(RHS_INTERLEAVE)

1959

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1960

#if defined(DUMMY_WORK_ITEMS)

1961

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1966

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1967

// Compute LHS matrix address

1968

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

1969

(get_global_id(2) * lhs_stride_z);

1970

1971

// Compute RHS matrix address

1972

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

1973

1974

#if defined(MATRIX_B_DEPTH)

1975

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1976

rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;

1977

#else // defined(MATRIX_B_DEPTH)

1978

rhs_addr += get_global_id(2) * rhs_stride_z;

1979

#endif // defined(MATRIX_B_DEPTH)

1980

1981

// Initialize the accumulators

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1982

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1983

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1984

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

1985

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1986

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

1987

for(int i = 0; i < k; i += K0)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1988

{

1989

// Supported cases (M0, K0):

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1990

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1991

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1992

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1993

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1994

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1995

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1996

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1997

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1998

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1999

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2000

2001

// Load values from RHS matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2002

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2003

2004

// Accumulate

2005

ARM_DOT_K0XN0(a0, b, c0);

2006

#if M0 > 1

2007

ARM_DOT_K0XN0(a1, b, c1);

2008

#endif // M0 > 1

2009

#if M0 > 2

2010

ARM_DOT_K0XN0(a2, b, c2);

2011

#endif // M0 > 2

2012

#if M0 > 3

2013

ARM_DOT_K0XN0(a3, b, c3);

2014

#endif // M0 > 3

2015

#if M0 > 4

2016

ARM_DOT_K0XN0(a4, b, c4);

2017

#endif // M0 > 4

2018

#if M0 > 5

2019

ARM_DOT_K0XN0(a5, b, c5);

2020

#endif // M0 > 5

2021

#if M0 > 6

2022

ARM_DOT_K0XN0(a6, b, c6);

2023

#endif // M0 > 6

2024

#if M0 > 7

2025

ARM_DOT_K0XN0(a7, b, c7);

2026

#endif // M0 > 7

2027

2028

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2029

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

2030

}

2031

2032

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2033

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2034

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2035

2036

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2037

2038

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2039

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2040

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2041

// multiply dst_stride_z by DEPTH_GEMM3D

2042

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2043

2044

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2045

2046

// Add offset for batched GEMM

2047

dst_addr += get_global_id(2) * dst_stride_z;

2048

2049

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2050

2051

// Multiply by the weight of matrix-matrix product and store the result

2052

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2053

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2054

#endif // defined(ALPHA)

2055

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2056

// Add beta*bias

2057

#if defined(BETA)

2058

#if defined(BROADCAST_BIAS)

2059

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2060

2061

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2062

2063

#ifndef UNIT_BETA

2064

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2065

#endif // UNIT_BIAS

2066

2067

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2068

#if defined(MIXED_PRECISION)

2069

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2070

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2071

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2072

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2073

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2074

2075

#else // defined(BROADCAST_BIAS)

2076

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2077

2) * bias_stride_z;

2078

2079

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2080

2081

#ifndef UNIT_BETA

2082

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

2083

#endif // UNIT_BIAS

2084

2085

// c = c + bias

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2086

#if defined(MIXED_PRECISION)

2087

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2088

ADD_BLOCK(M0, c, bias_hp);

2089

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2090

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2091

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2092

2093

#endif // defined(BROADCAST_BIAS)

2094

#endif // defined(BETA)

2095

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2096

#if defined(ACTIVATION_TYPE)

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2097

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2098

#endif // defined(ACTIVATION_TYPE)

2099

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2100

// Store output block

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2101

#if defined(MIXED_PRECISION)

2102

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

2103

#else // defined(MIXED_PRECISION)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2104

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2105

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2106

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2107

#undef LHS_BLOCK_SIZE

2108

#undef LHS_OFFSET_X

2109

#undef LHS_STEP_X

2110

#undef RHS_BLOCK_SIZE

2111

#undef RHS_OFFSET_X

2112

#undef RHS_STEP_X

2113

}

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2114

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2115

#if defined(LHS_TRANSPOSE)

2116

2117

// Shorthand for the vector type made of SIZE elements of TYPE
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

// ARM_VFMA(N0, a, b, c): c = c + a * b
// - When GPU_ARCH == GPU_ARCH_MIDGARD the update is written as a multiply followed by an add,
//   otherwise it is written with fma()
// - When MIXED_PRECISION is defined, a and b are first converted to a vector of N0 DATA_TYPE_ACCUMULATOR elements
// Note: the terminating semicolon is part of the macro body
#if(GPU_ARCH == GPU_ARCH_MIDGARD)

#if defined(MIXED_PRECISION)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // defined(MIXED_PRECISION)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#endif // defined(MIXED_PRECISION)

#else // GPU_ARCH == GPU_ARCH_MIDGARD

#if defined(MIXED_PRECISION)
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#else // defined(MIXED_PRECISION)
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // defined(MIXED_PRECISION)

#endif // GPU_ARCH == GPU_ARCH_MIDGARD

2136

2137

// Column-vector (transposed) by row-vector (not transposed) multiply-accumulate, K0 = 1.
// One macro per supported number of rows (1, 2, 3, 4, 8):
//   C<i> = C<i> + (a.s<i> replicated into an N0-wide vector of TYPE) * b
// The 1-row variant takes a itself instead of a component of a.
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
    ({                                                 \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
    })
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })

// Factory macro selecting the variant above that matches M0 (K0 = 1)
// a: column-vector (transposed), lower case because it is a vector
// b: row-vector (not transposed), lower case because it is a vector
// C: output matrix, upper case because it is a matrix (prefix of the row variables C0, C1, ...)
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)

2172

2173

// Matrix (transposed) by matrix (not transposed) multiply-accumulate for K0 = 1, 2, 3, 4 and 8.
// Step i along K accumulates the product of column-vector A<i> and row-vector B<i> into C
// through ARM_VVM_T_NT_M0xN0x1.
#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
    })
#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
    })
#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
    })
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
    })
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
    })

2200

#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \

2201

({ \

2202

ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \

2203

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \

2204

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \

2205

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \

2206

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \

2207

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \

2208

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \

2209

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \

2210

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \

2211

})

2212

2213

// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
// M0, N0 and K0 define the dimensions of the multiplication. Supported values:
//   M0: 1, 2, 3, 4, 8
//   N0: 1, 2, 3, 4, 8, 16
//   K0: 1, 2, 3, 4, 8, 16
// K0 selects the ARM_MM_T_NT_M0xN0x<K0> macro, which calls the vector-by-matrix macro K0 times.
// A, B and C are matrices
#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) CONCAT(ARM_MM_T_NT_M0xN0x, K0)(M0, N0, TYPE, A, B, C)

2224

2225

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2226

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

2227

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

2228

*

2229

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

2230

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2231

* @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 and -DN=90).

2232

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2233

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2234

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2235

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2236

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

2237

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2238

* - M0 = 2, 3, 4, 8

2239

* - N0 = 2, 3, 4, 8, 16

2240

* - K0 = 2, 3, 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

2245

* The activation function is performed after the bias addition

2246

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

2247

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2248

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2249

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2250

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2251

*

2252

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

2253

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2254

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2255

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2256

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2257

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2258

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

2259

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

2260

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

2261

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

2262

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

2263

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

2264

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2265

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2266

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2267

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2268

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2269

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2270

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

2271

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2272

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2273

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2274

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2275

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2276

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

2277

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2278

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2279

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2280

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2281

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2282

*/

2283

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),

2284

IMAGE_DECLARATION(rhs),

2285

#if defined(BETA)

2286

IMAGE_DECLARATION(bias),

2287

#endif // defined(BETA)

2288

IMAGE_DECLARATION(dst),

uint k,

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2295

uint dst_stride_z

2296

#if defined(REINTERPRET_OUTPUT_AS_3D)

2297

,

2298

uint dst_cross_plane_pad

2299

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

2304

2305

#if defined(LHS_INTERLEAVE)

2306

#define LHS_OFFSET_X (M0)

2307

#define LHS_STEP_X ((M0) * (V0))

2308

#define LHS_STEP_LOOP (1)

2309

#else // defined(INTERLEAVE)

2310

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2311

#define LHS_STEP_X (M0)

2312

#define LHS_STEP_LOOP (V0)

2313

#endif // defined(INTERLEAVE)

2314

2315

// Block size

2316

#define RHS_BLOCK_SIZE ((K0) * (N0))

2317

2318

// RHS offset and step X

2319

#if defined(RHS_INTERLEAVE)

2320

#define RHS_OFFSET_X (N0)

2321

#define RHS_STEP_X ((N0) * (H0))

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2322

#else // defined(RHS_INTERLEAVE)

2323

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2324

#define RHS_STEP_X (N0)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2325

#endif // defined(RHS_INTERLEAVE)

2326

2327

const uint x = get_global_id(0);

2328

const uint y = get_global_id(1);

2329

const uint z = get_global_id(2);

2330

2331

#if defined(DUMMY_WORK_ITEMS)

2332

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2337

2338

// Compute LHS matrix address

2339

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

2340

2341

// Compute RHS matrix address

2342

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

2343

2344

#if defined(MATRIX_B_DEPTH)

2345

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2346

rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;

2347

#else // defined(MATRIX_B_DEPTH)

2348

rhs_addr += z * rhs_stride_z;

2349

#endif // defined(MATRIX_B_DEPTH)

2350

2351

// Initialize the accumulators

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2352

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2353

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2354

REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

2355

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

2356

__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

2357

__global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);

2358

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2359

for(int i = 0; i < k; i += K0)

2360

{

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

2361

VEC_DATA_TYPE(DATA_TYPE, M0)

2362

a0 = VLOAD(M0)(0, lhs);

2363

VEC_DATA_TYPE(DATA_TYPE, N0)

2364

b0 = VLOAD(N0)(0, rhs);

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2365

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

2366

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2367

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

2368

lhs += LHS_STEP_X;

2369

rhs += RHS_STEP_X;

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2370

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

2371

#if K0 > 1

2372

a0 = VLOAD(M0)(0, lhs);

2373

b0 = VLOAD(N0)(0, rhs);

2374

2375

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 1

#if K0 > 2

a0 = VLOAD(M0)(0, lhs);

2383

b0 = VLOAD(N0)(0, rhs);

2384

2385

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 2

#if K0 > 3

a0 = VLOAD(M0)(0, lhs);

2393

b0 = VLOAD(N0)(0, rhs);

2394

2395

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 3

#if K0 > 4

a0 = VLOAD(M0)(0, lhs);

2403

b0 = VLOAD(N0)(0, rhs);

2404

2405

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2411

b0 = VLOAD(N0)(0, rhs);

2412

2413

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2419

b0 = VLOAD(N0)(0, rhs);

2420

2421

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2427

b0 = VLOAD(N0)(0, rhs);

2428

2429

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 4

#if K0 > 8

a0 = VLOAD(M0)(0, lhs);

2437

b0 = VLOAD(N0)(0, rhs);

2438

2439

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2445

b0 = VLOAD(N0)(0, rhs);

2446

2447

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2453

b0 = VLOAD(N0)(0, rhs);

2454

2455

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2461

b0 = VLOAD(N0)(0, rhs);

2462

2463

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2469

b0 = VLOAD(N0)(0, rhs);

2470

2471

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2477

b0 = VLOAD(N0)(0, rhs);

2478

2479

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2485

b0 = VLOAD(N0)(0, rhs);

2486

2487

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2493

b0 = VLOAD(N0)(0, rhs);

2494

2495

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 8

#ifndef LHS_INTERLEAVE

2502

lhs += (M0 * K0 * (V0 - 1));

2503

#endif // LHS_INTERLEAVE

2504

2505

#ifndef RHS_INTERLEAVE

2506

rhs += (N0 * K0 * (H0 - 1));

2507

#endif // RHS_INTERLEAVE

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2508

}

2509

2510

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

2511

2512

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

2513

2514

#if defined(REINTERPRET_OUTPUT_AS_3D)

2515

2516

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2517

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2518

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2519

// multiply dst_stride_z by DEPTH_GEMM3D

2520

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2521

2522

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2523

2524

// Add offset for batched GEMM

2525

dst_addr += z * dst_stride_z;

2526

2527

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2528

2529

// Multiply by the weight of matrix-matrix product and store the result

2530

#if defined(ALPHA)

2531

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2532

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2537

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

2538

2539

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2540

2541

#ifndef UNIT_BETA

2542

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2543

#endif // UNIT_BETA

2544

2545

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2546

#if defined(MIXED_PRECISION)

2547

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2548

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2549

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2550

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2551

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2552

2553

#else // defined(BROADCAST_BIAS)

2554

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

2555

2556

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2557

2558

#ifndef UNIT_BETA

2559

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

2560

#endif // UNIT_BETA

2561

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2562

#if defined(MIXED_PRECISION)

2563

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2564

ADD_BLOCK(M0, c, bias_hp);

2565

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2566

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2567

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2568

2569

#endif // defined(BROADCAST_BIAS)

2570

#endif // defined(BETA)

2571

2572

#if defined(ACTIVATION_TYPE)

2573

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2574

#endif // defined(ACTIVATION_TYPE)

2575

2576

// Store output block

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2577

#if defined(MIXED_PRECISION)

2578

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

2579

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2580

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2581

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2582

2583

#undef LHS_BLOCK_SIZE

2584

#undef LHS_OFFSET_X

2585

#undef LHS_STEP_X

2586

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(LHS_TRANSPOSE)

2592

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2593

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)

2594

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2595

#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

2596

2597

/* Fused multiply-add accumulate: stores fma(a, b, c) back into c.
 * The expansion is a GNU-style statement expression, so `c` must be a
 * modifiable lvalue; note that `c` appears twice in the expansion.
 */
#define VFMA(a, b, c) \
({ \
c = fma(a, b, c); \
})

/* RHS_VFMA_M0xN0(i, a, b, c)
 *
 * Performs one accumulation step for every one of the M0 output rows.
 * For each row r in [0, M0):
 *   - component .s<i> of the vector a<r> is converted to an N0-wide vector
 *     of DATA_TYPE (scalar broadcast through the vector cast),
 *   - that vector is multiplied by b and accumulated into c<r> through VFMA.
 *
 * Parameters:
 *   i - component suffix pasted after ".s" (0..9, A..F)
 *   a - basename of the LHS row vectors (a0, a1, ...)
 *   b - RHS row vector used for this step (a complete expression, not a basename)
 *   c - basename of the accumulators (c0, c1, ...)
 *
 * `i`, `a` and `c` are token-pasted directly, so they must be plain tokens.
 * Only M0 in the range 1..8 is supported; any other value stops the build.
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

2673

2674

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2675

* The LHS matrix is NOT reshaped

2676

* The RHS matrix is NOT reshaped

2677

*

2678

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2679

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90)

2680

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

2681

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

2682

* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)

2683

* @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2684

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2685

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

2686

* - N0 = 2, 3, 4, 8, 16

2687

* - K0 = 2, 3, 4, 8, 16

2688

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2689

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2690

* The activation function is performed after the bias addition

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2691

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2692

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

2693

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2694

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2695

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2696

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

2697

*

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2698

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

2699

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

2700

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2701

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

2702

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2703

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

2704

* @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr

2705

* @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)

2706

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

2707

* @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)

2708

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

2709

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2710

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2711

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2712

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2713

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2714

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2715

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2716

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2717

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2718

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2719

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2720

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2721

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2722

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

2723

* @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)

2724

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2725

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2726

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

2727

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2728

*/

2729

__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),

2730

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2731

#if defined(BETA)

2732

IMAGE_DECLARATION(bias),

2733

#endif // defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2734

IMAGE_DECLARATION(dst),

2735

uint lhs_stride_z,

2736

uint rhs_stride_z,

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2737

#if defined(BETA)

2738

uint bias_stride_z,

2739

#endif //defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2740

uint dst_stride_z

2741

#if defined(REINTERPRET_INPUT_AS_3D)

2742

,

2743

uint lhs_cross_plane_pad

2744

#endif // REINTERPRET_INPUT_AS_3D

2745

#if defined(REINTERPRET_OUTPUT_AS_3D)

2746

,

2747

uint dst_cross_plane_pad

2748

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

2753

2754

// RHS offset and step X

2755

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2756

2757

uint x = get_global_id(0);

2758

uint y = get_global_id(1);

2759

uint z = get_global_id(2);

2760

2761

#if defined(DUMMY_WORK_ITEMS)

2762

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2767

2768

// Compute LHS matrix address

2769

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

2770

2771

// Compute RHS matrix address

2772

uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

2773

2774

#if defined(MATRIX_B_DEPTH)

2775

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2776

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

2777

#else // defined(MATRIX_B_DEPTH)

2778

rhs_offset += z * rhs_stride_z;

2779

#endif // defined(MATRIX_B_DEPTH)

2780

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2781

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

2782

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2783

2784

#if defined(REINTERPRET_INPUT_AS_3D)

2785

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2786

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

2787

2788

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2789

// multiply lhs_stride_z by DEPTH_GEMM3D

2790

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

2791

2792

#else // defined(REINTERPRET_INPUT_AS_3D)

2793

2794

// Add offset for batched GEMM

2795

lhs_offset += z * lhs_stride_z;

2796

2797

#endif // defined(REINTERPRET_INPUT_AS_3D)

2798

2799

// Initialize the accumulators

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2800

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2801

2802

int i = 0;

2803

for(; i <= (K - K0); i += K0)

2804

{

2805

// Supported cases (M0, K0):

2806

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

2807

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

2808

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

2809

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

2810

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

2811

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

2812

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

2813

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

2814

// Load values from LHS matrix

2815

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

2816

2817

// Load values from RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2818

LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2819

2820

RHS_VFMA_M0xN0(0, a, b0, c);

2821

RHS_VFMA_M0xN0(1, a, b1, c);

2822

#if K0 > 2

2823

RHS_VFMA_M0xN0(2, a, b2, c);

2824

#endif // K0 > 2

2825

#if K0 > 3

2826

RHS_VFMA_M0xN0(3, a, b3, c);

2827

#endif // K0 > 3

2828

#if K0 > 4

2829

RHS_VFMA_M0xN0(4, a, b4, c);

2830

RHS_VFMA_M0xN0(5, a, b5, c);

2831

RHS_VFMA_M0xN0(6, a, b6, c);

2832

RHS_VFMA_M0xN0(7, a, b7, c);

2833

#endif // K0 > 4

2834

#if K0 > 8

2835

RHS_VFMA_M0xN0(8, a, b8, c);

2836

RHS_VFMA_M0xN0(9, a, b9, c);

Gian Marco Iodice

7b9d7ca

2019-09-19 16:37:39 +0100

[diff] [blame]

2837

RHS_VFMA_M0xN0(A, a, bA, c);

2838

RHS_VFMA_M0xN0(B, a, bB, c);

2839

RHS_VFMA_M0xN0(C, a, bC, c);

2840

RHS_VFMA_M0xN0(D, a, bD, c);

2841

RHS_VFMA_M0xN0(E, a, bE, c);

2842

RHS_VFMA_M0xN0(F, a, bF, c);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2843

#endif // K0 > 8

2844

2845

lhs_offset += K0 * sizeof(DATA_TYPE);

2846

rhs_offset += K0 * rhs_stride_y;

2847

}

2848

2849

// Left-over accumulations

2850

for(; i < K; ++i)

2851

{

2852

// Load values from LHS matrix

2853

VEC_DATA_TYPE(DATA_TYPE, 2)

2854

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));

2855

#if M0 > 1

2856

VEC_DATA_TYPE(DATA_TYPE, 2)

2857

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));

2858

#endif // M0 > 1

2859

#if M0 > 2

2860

VEC_DATA_TYPE(DATA_TYPE, 2)

2861

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));

2862

#endif // M0 > 2

2863

#if M0 > 3

2864

VEC_DATA_TYPE(DATA_TYPE, 2)

2865

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));

2866

#endif // M0 > 3

2867

#if M0 > 4

2868

VEC_DATA_TYPE(DATA_TYPE, 2)

2869

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));

2870

#endif // M0 > 4

2871

#if M0 > 5

2872

VEC_DATA_TYPE(DATA_TYPE, 2)

2873

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));

2874

#endif // M0 > 5

2875

#if M0 > 6

2876

VEC_DATA_TYPE(DATA_TYPE, 2)

2877

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));

2878

#endif // M0 > 6

2879

#if M0 > 7

2880

VEC_DATA_TYPE(DATA_TYPE, 2)

2881

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));

2882

#endif // M0 > 7

2883

2884

VEC_DATA_TYPE(DATA_TYPE, N0)

2885

b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));

2886

RHS_VFMA_M0xN0(0, a, b, c);

2887

2888

lhs_offset += sizeof(DATA_TYPE);

2889

rhs_offset += rhs_stride_y;

2890

}

2891

2892

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

2893

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2894

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2895

2896

#if defined(REINTERPRET_OUTPUT_AS_3D)

2897

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2898

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2899

2900

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2901

// multiply dst_stride_z by DEPTH_GEMM3D

2902

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2903

2904

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2905

2906

// Add offset for batched GEMM

2907

dst_addr += z * dst_stride_z;

2908

2909

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2910

2911

// Multiply by the weight of matrix-matrix product and store the result

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2912

#if defined(ALPHA)

2913

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2914

#endif // defined(ALPHA)

2915

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2916

// Add beta*bias

2917

#if defined(BETA)

2918

#if defined(BROADCAST_BIAS)

2919

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2920

2921

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2922

2923

#ifndef UNIT_BETA

2924

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2925

#endif // UNIT_BIAS

2926

2927

// c = c + bias[broadcasted]

2928

ADD_BLOCK_BROADCAST(M0, c, bias0);

2929

2930

#else // defined(BROADCAST_BIAS)

2931

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2932

2) * bias_stride_z;

2933

2934

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2935

2936

#ifndef UNIT_BETA

2937

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

2942

2943

#endif // defined(BROADCAST_BIAS)

2944

#endif // defined(BETA)

2945

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2946

#if defined(ACTIVATION_TYPE)

2947

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2948

#endif // defined(ACTIVATION_TYPE)

2949

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2950

// Store output block

2951

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

2952

2953

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

2958

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2959

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2960

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

2961

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2962

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2963

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

2964

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

2965

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

2966

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2967

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2968

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

2969

* The activation function is performed after the bias addition

2970

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2971

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2972

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2973

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2974

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

2975

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2976

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

2977

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2978

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

2979

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2980

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

2981

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2982

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2983

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2984

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

2985

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2986

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

2987

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2988

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

2989

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2990

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

2991

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2992

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

2993

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2994

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2995

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2996

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2997

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2998

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2999

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3000

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

3001

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3002

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3003

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

3004

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3005

*/

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3006

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),

3007

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3008

#if defined(BETA)

3009

IMAGE_DECLARATION(src2),

3010

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3011

IMAGE_DECLARATION(dst),

3012

uint src0_stride_z,

3013

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3014

#if defined(BETA)

3015

uint src2_stride_z,

3016

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3017

uint dst_stride_z

3018

#if defined(REINTERPRET_OUTPUT_AS_3D)

3019

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3020

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3021

#endif // REINTERPRET_OUTPUT_AS_3D

3022

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3023

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3024

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

3025

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3026

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3027

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3028

// Offset

3029

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

3030

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3031

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3032

// src_addr_a = address of matrix A

3033

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

3034

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

3035

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

3036

3037

#if defined(MATRIX_B_DEPTH)

3038

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3039

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

3040

#else // defined(MATRIX_B_DEPTH)

3041

src1_addr_in_bytes += z * src1_stride_z;

3042

#endif // defined(MATRIX_B_DEPTH)

3043

3044

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

3045

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3046

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3047

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3048

__global float *src_end_addr_b = src_addr_b + COLS_B;

3049

3050

src_addr_a += offset_row_a;

3051

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3052

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3053

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3058

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3059

for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3060

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3061

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3062

float4 a0 = vload4(0, src_addr_a);

3063

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3064

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3065

c0 += (float4)a0.s0 * b0;

3066

c1 += (float4)a0.s1 * b0;

3067

c2 += (float4)a0.s2 * b0;

3068

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3069

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3070

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3071

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

3072

b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3073

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3074

c0 += (float4)a0.s0 * b0;

3075

c1 += (float4)a0.s1 * b0;

3076

c2 += (float4)a0.s2 * b0;

3077

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3078

}

3079

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3080

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3081

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3082

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3083

float4 a0 = vload4(0, src_addr_a);

3084

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3085

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3086

c0 += (float4)a0.s0 * b0;

3087

c1 += (float4)a0.s1 * b0;

3088

c2 += (float4)a0.s2 * b0;

3089

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3090

}

3091

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3092

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3093

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

3094

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3095

// Compute dst address

3096

__global uchar *dst_addr = offset(&dst, 0, 0);

3097

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3098

uint4 zout = 0;

3099

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3100

#if defined(REINTERPRET_OUTPUT_AS_3D)

3101

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3102

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3103

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

3108

// |******************|

3109

// | cross_plane_pad |

3110

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3115

3116

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3117

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

3118

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3119

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3120

// Add offset due to the cross plane paddings

3121

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3122

3123

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3124

// multiply dst_stride_z by DEPTH_GEMM3D

3125

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3126

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3127

// Add offset for batched GEMM

3128

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3129

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3130

3131

// Multiply by the weight of matrix-matrix product and store the result

3132

#if defined(ALPHA)

3133

SCALE_BLOCK(4, float, c, ALPHA);

3134

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

3139

3140

#if defined(BROADCAST_BIAS)

3141

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

3142

3143

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

3144

3145

#ifndef UNIT_BETA

3146

SCALE_BLOCK(1, float, bias, BETA);

3147

#endif // UNIT_BETA

3148

3149

// c = c + bias[broadcasted]

3150

ADD_BLOCK_BROADCAST(4, c, bias0);

3151

3152

#else // defined(BROADCAST_BIAS)

3153

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

3154

2) * src2_stride_z;

3155

3156

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

3157

3158

#ifndef UNIT_BETA

3159

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

3164

3165

#endif // defined(BROADCAST_BIAS)

3166

#endif // defined(BETA)

3167

3168

#if defined(ACTIVATION_TYPE)

3169

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

3170

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3171

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3172

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3173

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

3174

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

3175

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

3176

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3177

}

3178

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3179

/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3180

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

3181

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3182

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

3183

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

3184

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

3185

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

3186

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3187

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3188

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3189

* The activation function is performed after the bias addition

3190

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3191

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3192

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3193

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3194

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

3195

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3196

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

3197

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

3198

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

3199

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

3200

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

3201

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3202

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3203

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

3204

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

3205

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

3206

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

3207

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3208

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

3209

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3210

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

3211

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3212

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

3213

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3214

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3215

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3216

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3217

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3218

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3219

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3220

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

3221

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3222

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3223

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

3224

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3225

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

3226

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

3227

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3228

#if defined(BETA)

3229

IMAGE_DECLARATION(src2),

3230

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3231

IMAGE_DECLARATION(dst),

3232

uint src0_stride_z,

3233

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3234

#if defined(BETA)

3235

uint src2_stride_z,

3236

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3237

uint dst_stride_z

3238

#if defined(REINTERPRET_OUTPUT_AS_3D)

3239

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3240

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3241

#endif // REINTERPRET_OUTPUT_AS_3D

3242

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3243

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3244

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

3245

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3246

int z = get_global_id(2);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3247

3248

// Offset

3249

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

3250

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

3251

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3252

// src_addr_a = address of matrix A

3253

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

3254

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

3255

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

3256

3257

#if defined(MATRIX_B_DEPTH)

3258

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3259

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

3260

#else // defined(MATRIX_B_DEPTH)

3261

src1_addr_in_bytes += z * src1_stride_z;

3262

#endif // defined(MATRIX_B_DEPTH)

3263

3264

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

3265

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3266

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3267

src_addr_a += offset_row_a;

3268

src_addr_b += offset_row_b;

3269

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3270

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3275

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

3276

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

3277

3278

int i = 0;

3279

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3280

{

3281

// Load values from matrix A (interleaved) and matrix B (transposed)

3282

float4 a0 = vload4(0, src_addr_a);

3283

float4 b0 = vload4(0, src_addr_b);

3284

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

3285

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

3286

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3287

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3288

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

3289

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

3290

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

3291

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3292

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3293

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

3294

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

3295

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

3296

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3297

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3298

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

3299

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

3300

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

3301

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3302

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3303

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

3304

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

3305

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

3306

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3307

3308

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

3309

a0 = vload4(0, src_addr_a);

3310

b0 = vload4(0, src_addr_b);

3311

3312

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

3313

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3314

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3315

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

3316

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

3317

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

3318

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3319

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3320

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

3321

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

3322

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

3323

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3324

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3325

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

3326

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

3327

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

3328

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3329

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3330

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

3331

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

3332

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

3333

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3334

3335

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

3336

a0 = vload4(0, src_addr_a);

3337

b0 = vload4(0, src_addr_b);

3338

3339

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

3340

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

3341

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3342

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

3343

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

3344

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

3345

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

3346

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3347

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

3348

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

3349

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

3350

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

3351

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3352

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

3353

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

3354

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

3355

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

3356

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3357

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

3358

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

3359

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

3360

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

3361

3362

// Load values from matrix A (interleaved) and matrix B (transposed)

3363

a0 = vload4(0, src_addr_a);

3364

b0 = vload4(0, src_addr_b);

3365

3366

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

3367

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3368

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3369

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

3370

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

3371

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

3372

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3373

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3374

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

3375

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

3376

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

3377

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3378

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3379

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

3380

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

3381

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

3382

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3383

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3384

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

3385

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

3386

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

3387

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3388

}

3389

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

3390

for(; i < (int)(COLS_MTX_B); ++i)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3391

{

3392

// Load values from matrix A (interleaved) and matrix B (transposed)

3393

float4 a0 = vload4(0, src_addr_a);

3394

float4 b0 = vload4(0, src_addr_b);

3395

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

3396

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

3397

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

3398

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3399

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

3400

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

3401

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

3402

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3403

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3404

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

3405

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

3406

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

3407

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3408

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3409

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

3410

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

3411

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

3412

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3413

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3414

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

3415

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

3416

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

3417

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3418

}

3419

3420

// Compute destination address

3421

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

3422

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3423

// Compute dst address

3424

__global uchar *dst_addr = offset(&dst, 0, 0);

3425

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3426

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

3427

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3428

#if defined(REINTERPRET_OUTPUT_AS_3D)

3429

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3430

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3431

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

3436

// |******************|

3437

// | cross_plane_pad |

3438

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3443

3444

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3445

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

3446

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3447

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3448

// Add offset due to the cross plane paddings

3449

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3450

3451

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3452

// multiply dst_stride_z by DEPTH_GEMM3D

3453

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3454

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3455

// Add offset for batched GEMM

3456

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3457

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3458

3459

// Multiply by the weight of matrix-matrix product and store the result

3460

#if defined(ALPHA)

3461

SCALE_BLOCK(4, float, c, ALPHA);

3462

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

3467

3468

#if defined(BROADCAST_BIAS)

3469

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

3470

3471

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

3472

3473

#ifndef UNIT_BETA

3474

SCALE_BLOCK(1, float, bias, BETA);

3475

#endif // UNIT_BETA

3476

3477

// c = c + bias[broadcasted]

3478

ADD_BLOCK_BROADCAST(4, c, bias0);

3479

3480

#else // defined(BROADCAST_BIAS)

3481

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

3482

2) * src2_stride_z;

3483

3484

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

3485

3486

#ifndef UNIT_BETA

3487

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

3492

3493

#endif // defined(BROADCAST_BIAS)

3494

#endif // defined(BETA)

3495

3496

#if defined(ACTIVATION_TYPE)

3497

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

3498

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3499

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3500

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3501

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

3502

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

3503

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

3504

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3505

}

3506

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

3507

// Undefine local defines

3508

#undef COLS_MTX_B

3509

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

3510

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3511

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

3512

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

3513

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3514

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

3515

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

3516

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

3517

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3518

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3519

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3520

* The activation function is performed after the bias addition

3521

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3522

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3523

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3524

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3525

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

3526

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3527

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

3528

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

3529

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

3530

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

3531

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

3532

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3533

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3534

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

3535

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

3536

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

3537

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

3538

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3539

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

3540

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3541

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

3542

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3543

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

3544

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3545

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3546

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3547

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3548

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3549

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3550

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3551

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

3552

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3553

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3554

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

3555

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3556

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

3557

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

3558

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3559

#if defined(BETA)

3560

IMAGE_DECLARATION(src2),

3561

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3562

IMAGE_DECLARATION(dst),

3563

uint src0_stride_z,

3564

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3565

#if defined(BETA)

3566

uint src2_stride_z,

3567

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3568

uint dst_stride_z

3569

#if defined(REINTERPRET_OUTPUT_AS_3D)

3570

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3571

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3572

#endif // REINTERPRET_OUTPUT_AS_3D

3573

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3574

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3575

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

3576

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3577

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3578

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3579

// Offset

3580

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

3581

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3582

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3583

// src_addr_a = address of matrix A

3584

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

3585

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

3586

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

3587

3588

#if defined(MATRIX_B_DEPTH)

3589

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3590

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

3591

#else // defined(MATRIX_B_DEPTH)

3592

src1_addr_in_bytes += z * src1_stride_z;

3593

#endif // defined(MATRIX_B_DEPTH)

3594

3595

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

3596

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3597

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3598

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3599

__global half *src_end_addr_b = src_addr_b + COLS_B;

3600

3601

src_addr_a += offset_row_a;

3602

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3603

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3604

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3609

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3610

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3611

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3612

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3613

half4 a0 = vload4(0, src_addr_a);

3614

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3615

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3616

c0 += (half8)a0.s0 * b0;

3617

c1 += (half8)a0.s1 * b0;

3618

c2 += (half8)a0.s2 * b0;

3619

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3620

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3621

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3622

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

3623

b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3624

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3625

c0 += (half8)a0.s0 * b0;

3626

c1 += (half8)a0.s1 * b0;

3627

c2 += (half8)a0.s2 * b0;

3628

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3629

}

3630

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3631

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3632

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3633

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3634

half4 a0 = vload4(0, src_addr_a);

3635

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3636

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3637

c0 += (half8)a0.s0 * b0;

3638

c1 += (half8)a0.s1 * b0;

3639

c2 += (half8)a0.s2 * b0;

3640

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3641

}

3642

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3643

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3644

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

3645

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3646

// Compute dst address

3647

__global uchar *dst_addr = offset(&dst, 0, 0);

3648

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3649

uint4 zout = 0;

3650

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3651

#if defined(REINTERPRET_OUTPUT_AS_3D)

3652

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3653

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3654

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// |                  |

// |      plane0      |

// |                  |

// |__________________|

3659

// |******************|

3660

// |  cross_plane_pad |

3661

// |******************|

// |                  |

// |      plane1      |

// |                  |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3666

3667

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3668

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

3669

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3670

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3671

// Add offset due to the cross plane paddings

3672

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3673

3674

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3675

// multiply dst_stride_z by DEPTH_GEMM3D

3676

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3677

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3678

// Add offset for batched GEMM

3679

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3680

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3681

3682

// Multiply by the weight of matrix-matrix product and store the result

3683

#if defined(ALPHA)

3684

SCALE_BLOCK(4, half, c, ALPHA);

3685

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

3690

3691

#if defined(BROADCAST_BIAS)

3692

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

3693

3694

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

3695

3696

#ifndef UNIT_BETA

3697

SCALE_BLOCK(1, half, bias, BETA);

3698

#endif // UNIT_BETA

3699

3700

// c = c + bias[broadcasted]

3701

ADD_BLOCK_BROADCAST(4, c, bias0);

3702

3703

#else // defined(BROADCAST_BIAS)

3704

3705

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

3706

2) * src2_stride_z;

3707

3708

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

3709

3710

#ifndef UNIT_BETA

3711

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

3716

3717

#endif // defined(BROADCAST_BIAS)

3718

#endif // defined(BETA)

3719

3720

#if defined(ACTIVATION_TYPE)

3721

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

3722

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3723

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3724

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3725

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

3726

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

3727

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

3728

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3729

}

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3730

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3731

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in 32-bit floating-point variables.

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

3732

*

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3733

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3734

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

3735

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

3736

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

3737

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3738

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3739

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3740

* The activation function is performed after the bias addition

3741

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3742

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3743

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3744

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3745

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

3746

*

3747

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

3748

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

3749

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

3750

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

3751

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

3752

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

3753

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

3754

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

3755

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

3756

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

3757

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

3758

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3759

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

3760

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3761

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

3762

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3763

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

3764

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3765

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

3766

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3767

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3768

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3769

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3770

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

3771

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

3772

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3773

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3774

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3775

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3776

*/

3777

__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),

3778

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3779

#if defined(BETA)

3780

IMAGE_DECLARATION(src2),

3781

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3782

IMAGE_DECLARATION(dst),

3783

uint src0_stride_z,

3784

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3785

#if defined(BETA)

3786

uint src2_stride_z,

3787

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3788

uint dst_stride_z

3789

#if defined(REINTERPRET_OUTPUT_AS_3D)

3790

,

3791

uint cross_plane_pad

3792

#endif // REINTERPRET_OUTPUT_AS_3D

3793

)

3794

{

3795

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

3796

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

3797

int z = get_global_id(2);

3798

3799

// Offset

3800

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

3801

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

3802

3803

// src_addr_a = address of matrix A

3804

// src_addr_b = address of matrix B

3805

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

3806

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

3807

3808

#if defined(MATRIX_B_DEPTH)

3809

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3810

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

3811

#else // defined(MATRIX_B_DEPTH)

3812

src1_addr_in_bytes += z * src1_stride_z;

3813

#endif // defined(MATRIX_B_DEPTH)

3814

3815

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

3816

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

3817

3818

// Compute end row address for matrix B

3819

__global half *src_end_addr_b = src_addr_b + COLS_B;

3820

3821

src_addr_a += offset_row_a;

3822

src_addr_b += offset_row_b;

3823

3824

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float8 c0 = 0.0f;

float8 c1 = 0.0f;

float8 c2 = 0.0f;

float8 c3 = 0.0f;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3829

3830

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

3831

{

3832

// Load values from matrix A (interleaved) and matrix B (transposed)

3833

float4 a0 = convert_float4(vload4(0, src_addr_a));

3834

float8 b0 = convert_float8(vload8(0, src_addr_b));

3835

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3836

c0 += (float8)a0.s0 * b0;

3837

c1 += (float8)a0.s1 * b0;

3838

c2 += (float8)a0.s2 * b0;

3839

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3840

3841

// Load values from matrix A (interleaved) and matrix B (transposed)

3842

a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));

3843

b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));

3844

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3845

c0 += (float8)a0.s0 * b0;

3846

c1 += (float8)a0.s1 * b0;

3847

c2 += (float8)a0.s2 * b0;

3848

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3849

}

3850

3851

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

3852

{

3853

// Load values from matrix A (interleaved) and matrix B (transposed)

3854

float4 a0 = convert_float4(vload4(0, src_addr_a));

3855

float8 b0 = convert_float8(vload8(0, src_addr_b));

3856

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3857

c0 += (float8)a0.s0 * b0;

3858

c1 += (float8)a0.s1 * b0;

3859

c2 += (float8)a0.s2 * b0;

3860

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3861

}

3862

3863

// Compute destination address

3864

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

3865

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3866

// Compute dst address

3867

__global uchar *dst_addr = offset(&dst, 0, 0);

3868

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3869

uint4 zout = 0;

3870

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3871

#if defined(REINTERPRET_OUTPUT_AS_3D)

3872

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

3873

// in order to take into account the presence of possible cross plane paddings

//

// |                  |

// |      plane0      |

// |                  |

// |__________________|

3879

// |******************|

3880

// |  cross_plane_pad |

3881

// |******************|

// |                  |

// |      plane1      |

// |                  |

// |__________________|

3886

3887

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3888

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

3889

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3890

3891

// Add offset due to the cross plane paddings

3892

zout *= (cross_plane_pad * dst_stride_y);

3893

3894

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3895

// multiply dst_stride_z by DEPTH_GEMM3D

3896

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3897

#else // defined(REINTERPRET_OUTPUT_AS_3D)

3898

// Add offset for batched GEMM

3899

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3900

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3901

3902

// Multiply by the weight of matrix-matrix product and store the result

3903

#if defined(ALPHA)

3904

SCALE_BLOCK(4, float, c, ALPHA);

3905

#endif // defined(ALPHA)

3906

3907

#if defined(BETA)

3908

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

3909

3910

#if defined(BROADCAST_BIAS)

3911

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

3912

3913

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

3914

3915

float8 bias_f0 = convert_float8(bias0);

3916

3917

#ifndef UNIT_BETA

3918

SCALE_BLOCK(1, float, bias_f, BETA);

3919

#endif // UNIT_BETA

3920

3921

// c = c + bias[broadcasted]

3922

ADD_BLOCK_BROADCAST(4, c, bias_f0);

3923

3924

#else // defined(BROADCAST_BIAS)

3925

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

3926

2) * src2_stride_z;

3927

3928

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

3929

3930

float8 bias_f0 = convert_float8(bias0);

3931

float8 bias_f1 = convert_float8(bias1);

3932

float8 bias_f2 = convert_float8(bias2);

3933

float8 bias_f3 = convert_float8(bias3);

3934

3935

#ifndef UNIT_BETA

3936

SCALE_BLOCK(4, float, bias_f, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias_f);

3941

3942

#endif // defined(BROADCAST_BIAS)

3943

#endif // defined(BETA)

3944

3945

half8 c_h0 = convert_half8(c0);

3946

half8 c_h1 = convert_half8(c1);

3947

half8 c_h2 = convert_half8(c2);

3948

half8 c_h3 = convert_half8(c3);

3949

3950

#if defined(ACTIVATION_TYPE)

3951

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);

3952

#endif // defined(ACTIVATION_TYPE)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3953

3954

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3955

vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

3956

vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

3957

vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

3958

vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3959

}

3960

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3961

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

3962

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3963

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3964

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

3965

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

3966

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

3967

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3968

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3969

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3970

* The activation function is performed after the bias addition

3971

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3972

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3973

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3974

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3975

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

3976

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3977

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

3978

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

3979

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

3980

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

3981

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

3982

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

3983

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

3984

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

3985

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

3986

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

3987

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

3988

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3989

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

3990

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3991

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

3992

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3993

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

3994

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3995

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

3996

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3997

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3998

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3999

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4000

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4001

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4002

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

4003

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4004

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4005

*/

4006

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),

4007

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4008

#if defined(BETA)

4009

IMAGE_DECLARATION(src2),

4010

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4011

IMAGE_DECLARATION(dst),

4012

uint src0_stride_z,

4013

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4014

#if defined(BETA)

4015

uint src2_stride_z,

4016

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4017

uint dst_stride_z

4018

#if defined(REINTERPRET_OUTPUT_AS_3D)

4019

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4020

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4021

#endif // REINTERPRET_OUTPUT_AS_3D

4022

)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4023

{

4024

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4025

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

4026

int z = get_global_id(2);

4027

4028

// Offset

4029

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4030

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

4031

4032

// src_addr_a = address of matrix A

4033

// src_addr_b = address of matrix B

4034

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4035

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4036

4037

#if defined(MATRIX_B_DEPTH)

4038

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4039

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4040

#else // defined(MATRIX_B_DEPTH)

4041

src1_addr_in_bytes += z * src1_stride_z;

4042

#endif // defined(MATRIX_B_DEPTH)

4043

4044

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

4045

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

4046

4047

// Compute end row address for matrix B

4048

__global half *src_end_addr_b = src_addr_b + COLS_B;

4049

4050

src_addr_a += offset_row_a;

4051

src_addr_b += offset_row_b;

4052

4053

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4058

4059

#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

4060

4061

int i = 0;

4062

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

4063

{

4064

#if MULT_INTERLEAVE4X4_HEIGHT == 1

4065

// Load values from matrix A (interleaved) and matrix B (transposed)

4066

half8 a0 = vload8(0, src_addr_a);

4067

half8 b0 = vload8(0, src_addr_b);

4068

4069

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

4070

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

4071

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4072

c0 = fma((half8)a0.s0, b0, c0);

4073

c1 = fma((half8)a0.s1, b0, c1);

4074

c2 = fma((half8)a0.s2, b0, c2);

4075

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4076

4077

// Load values from matrix B (transposed)

4078

b0 = vload8(0, src_addr_b);

4079

4080

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

4081

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4082

c0 = fma((half8)a0.s4, b0, c0);

4083

c1 = fma((half8)a0.s5, b0, c1);

4084

c2 = fma((half8)a0.s6, b0, c2);

4085

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4086

4087

// Load values from matrix A (interleaved) and matrix B (transposed)

4088

a0 = vload8(0, src_addr_a);

4089

b0 = vload8(0, src_addr_b);

4090

4091

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

4092

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

4093

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4094

c0 = fma((half8)a0.s0, b0, c0);

4095

c1 = fma((half8)a0.s1, b0, c1);

4096

c2 = fma((half8)a0.s2, b0, c2);

4097

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4098

4099

// Load values from matrix B (transposed)

4100

b0 = vload8(0, src_addr_b);

4101

4102

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

4103

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4104

c0 = fma((half8)a0.s4, b0, c0);

4105

c1 = fma((half8)a0.s5, b0, c1);

4106

c2 = fma((half8)a0.s6, b0, c2);

4107

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4108

#else // MULT_INTERLEAVE4X4_HEIGHT == 1

4109

// Load values from matrix A (interleaved) and matrix B (transposed)

4110

half4 a0 = vload4(0, src_addr_a);

4111

half8 b0 = vload8(0, src_addr_b);

4112

4113

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4114

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

4115

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4116

c0 = fma((half8)a0.s0, b0, c0);

4117

c1 = fma((half8)a0.s1, b0, c1);

4118

c2 = fma((half8)a0.s2, b0, c2);

4119

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4120

4121

// Load values from matrix A (interleaved) and matrix B (transposed)

4122

a0 = vload4(0, src_addr_a);

4123

b0 = vload8(0, src_addr_b);

4124

4125

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4126

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

4127

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4128

c0 = fma((half8)a0.s0, b0, c0);

4129

c1 = fma((half8)a0.s1, b0, c1);

4130

c2 = fma((half8)a0.s2, b0, c2);

4131

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4132

4133

// Load values from matrix A (interleaved) and matrix B (transposed)

4134

a0 = vload4(0, src_addr_a);

4135

b0 = vload8(0, src_addr_b);

4136

4137

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4138

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

4139

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4140

c0 = fma((half8)a0.s0, b0, c0);

4141

c1 = fma((half8)a0.s1, b0, c1);

4142

c2 = fma((half8)a0.s2, b0, c2);

4143

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4144

4145

// Load values from matrix A (interleaved) and matrix B (transposed)

4146

a0 = vload4(0, src_addr_a);

4147

b0 = vload8(0, src_addr_b);

4148

4149

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4150

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

4151

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4152

c0 = fma((half8)a0.s0, b0, c0);

4153

c1 = fma((half8)a0.s1, b0, c1);

4154

c2 = fma((half8)a0.s2, b0, c2);

4155

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4156

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

4157

}

4158

4159

for(; i < (int)(COLS_MTX_B); ++i)

4160

{

4161

// Load values from matrix A (interleaved) and matrix B (transposed)

4162

half4 a0 = vload4(0, src_addr_a);

4163

half8 b0 = vload8(0, src_addr_b);

4164

4165

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4166

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

4167

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4168

c0 = fma((half8)a0.s0, b0, c0);

4169

c1 = fma((half8)a0.s1, b0, c1);

4170

c2 = fma((half8)a0.s2, b0, c2);

4171

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4172

}

4173

4174

// Compute destination address

4175

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4176

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4177

// Compute dst address

4178

__global uchar *dst_addr = offset(&dst, 0, 0);

4179

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4180

uint4 zout = 0;

4181

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4182

#if defined(REINTERPRET_OUTPUT_AS_3D)

4183

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4184

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4185

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4190

// |******************|

4191

// | cross_plane_pad |

4192

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4197

4198

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4199

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4200

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4201

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4202

// Add offset due to the cross plane paddings

4203

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4204

4205

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4206

// multiply dst_stride_z by DEPTH_GEMM3D

4207

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4208

#else // defined(REINTERPRET_OUTPUT_AS_3D)

4209

// Add offset for batched GEMM

4210

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4211

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4212

4213

// Multiply by the weight of matrix-matrix product and store the result

4214

#if defined(ALPHA)

4215

SCALE_BLOCK(4, half, c, ALPHA);

4216

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4221

4222

#if defined(BROADCAST_BIAS)

4223

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

4224

4225

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

4226

4227

#ifndef UNIT_BETA

4228

SCALE_BLOCK(1, half, bias, BETA);

4229

#endif // UNIT_BETA

4230

4231

// c = c + bias[broadcasted]

4232

ADD_BLOCK_BROADCAST(4, c, bias0);

4233

4234

#else // defined(BROADCAST_BIAS)

4235

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

4236

2) * src2_stride_z;

4237

4238

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

4239

4240

#ifndef UNIT_BETA

4241

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

4246

4247

#endif // defined(BROADCAST_BIAS)

4248

#endif // defined(BETA)

4249

4250

#if defined(ACTIVATION_TYPE)

4251

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

4252

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4253

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4254

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4255

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

4256

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

4257

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

4258

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4259

}

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

4260

4261

// Undefine local defines

4262

#undef COLS_MTX_B

4263

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

4264

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4265

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4266

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4267

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4268

#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

4269

#if defined(DATA_TYPE)

4270

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4271

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.

4272

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4273

* @note This OpenCL kernel works with floating point data types (F16/F32)

4274

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

4275

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4276

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4277

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4278

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4279

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4280

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

4281

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4282

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

4283

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4284

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4285

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4286

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4287

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4288

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4289

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4290

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4291

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4292

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4293

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4294

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4295

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4296

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4297

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4298

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4299

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4300

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4301

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4302

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4303

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4304

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4305

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4306

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4307

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4308

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4309

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4310

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4311

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4312

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4313

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4314

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4315

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4316

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4317

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

4318

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4319

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4320

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

4321

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4322

#if defined(BETA)

4323

IMAGE_DECLARATION(src2),

4324

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4325

IMAGE_DECLARATION(dst),

4326

uint src0_stride_z,

4327

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4328

#if defined(BETA)

4329

uint src2_stride_z,

4330

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4331

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4332

#if defined(REINTERPRET_INPUT_AS_3D)

4333

,

4334

uint src_cross_plane_pad

4335

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4336

#if defined(REINTERPRET_OUTPUT_AS_3D)

4337

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4338

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4339

#endif // REINTERPRET_OUTPUT_AS_3D

4340

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4341

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4342

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4343

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4344

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4345

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4346

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4347

// Update address for the matrix A

4348

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4349

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4350

// Update address for the matrix B

4351

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4352

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4353

#if defined(REINTERPRET_INPUT_AS_3D)

4354

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

4355

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

4361

// |******************|

4362

// | cross_plane_pad |

4363

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

4368

4369

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

4370

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

4371

zin = min(DEPTH_GEMM3D - 1, zin);

4372

4373

// Add offset due to the cross plane paddings

4374

zin *= (src_cross_plane_pad * src0_stride_y);

4375

4376

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4377

// multiply src0_stride_z by DEPTH_GEMM3D

4378

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

4379

4380

#else // defined(REINTERPRET_INPUT_AS_3D)

4381

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4382

// Add offset for batched GEMM

4383

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4384

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4385

#endif // defined(REINTERPRET_INPUT_AS_3D)

4386

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4387

#if defined(MATRIX_B_DEPTH)

4388

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4389

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

4390

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4391

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4392

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4393

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4394

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

4395

4396

VECTOR_TYPE acc0 = 0.0f;

4397

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4398

VECTOR_TYPE acc1 = 0.0f;

4399

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4400

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4401

VECTOR_TYPE acc2 = 0.0f;

4402

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4403

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4404

VECTOR_TYPE acc3 = 0.0f;

4405

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4406

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

4407

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4408

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4409

#if defined(REINTERPRET_INPUT_AS_3D)

4410

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

4411

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

4412

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4413

// Load values from matrix A

4414

VEC_DATA_TYPE(DATA_TYPE, 2)

4415

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

4416

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4417

VEC_DATA_TYPE(DATA_TYPE, 2)

4418

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

4419

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4420

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4421

VEC_DATA_TYPE(DATA_TYPE, 2)

4422

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

4423

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4424

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4425

VEC_DATA_TYPE(DATA_TYPE, 2)

4426

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

4427

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4428

#endif // defined(REINTERPRET_INPUT_AS_3D)

4429

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4430

// Load values from matrix B

4431

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

4432

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4433

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4434

// Accumulate

4435

acc0 += b0 * (VECTOR_TYPE)a0.s0;

4436

acc0 += b1 * (VECTOR_TYPE)a0.s1;

4437

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4438

acc1 += b0 * (VECTOR_TYPE)a1.s0;

4439

acc1 += b1 * (VECTOR_TYPE)a1.s1;

4440

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4441

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4442

acc2 += b0 * (VECTOR_TYPE)a2.s0;

4443

acc2 += b1 * (VECTOR_TYPE)a2.s1;

4444

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4445

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4446

acc3 += b0 * (VECTOR_TYPE)a3.s0;

4447

acc3 += b1 * (VECTOR_TYPE)a3.s1;

4448

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4449

}

4450

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4451

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4452

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4453

#if defined(REINTERPRET_INPUT_AS_3D)

4454

// Load values from matrix A

4455

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

4456

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4457

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

4458

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4459

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4460

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

4461

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4462

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4463

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

4464

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4465

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4466

// Load values from matrix A

4467

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

4468

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4469

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

4470

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4471

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4472

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

4473

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4474

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4475

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

4476

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4477

#endif // defined(REINTERPRET_INPUT_AS_3D)

4478

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4479

// Load values from matrix B

4480

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4481

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4482

// Accumulate

4483

acc0 += b0 * (VECTOR_TYPE)a0;

4484

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4485

acc1 += b0 * (VECTOR_TYPE)a1;

4486

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4487

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4488

acc2 += b0 * (VECTOR_TYPE)a2;

4489

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4490

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4491

acc3 += b0 * (VECTOR_TYPE)a3;

4492

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4493

}

4494

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4495

int z = get_global_id(2);

4496

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4497

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4498

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4499

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4500

// Compute dst address

4501

__global uchar *dst_addr = offset(&dst, 0, 0);

4502

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4503

uint4 zout = 0;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4504

4505

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4506

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4507

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4508

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4509

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4514

// |******************|

4515

// | cross_plane_pad |

4516

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4521

4522

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4523

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

4524

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4525

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4526

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4527

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4528

4529

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4530

// multiply dst_stride_z by DEPTH_GEMM3D

4531

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4532

#else // defined(REINTERPRET_OUTPUT_AS_3D)

4533

// Add offset for batched GEMM

4534

dst_addr += z * dst_stride_z;

4535

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4536

4537

// Multiply by the weight of matrix-matrix product and store the result

4538

#if defined(ALPHA)

4539

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);

4540

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

4545

4546

#if defined(BROADCAST_BIAS)

4547

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));

4548

4549

LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

4550

4551

#ifndef UNIT_BETA

4552

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

4553

#endif // UNIT_BETA

4554

4555

// c = c + bias[broadcasted]

4556

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

4557

4558

#else // defined(BROADCAST_BIAS)

4559

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *

4560

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

4561

4562

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

4563

4564

#ifndef UNIT_BETA

4565

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

4570

4571

#endif // defined(BROADCAST_BIAS)

4572

#endif // defined(BETA)

4573

4574

#if defined(ACTIVATION_TYPE)

4575

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);

4576

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4577

4578

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

4579

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4580

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4581

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4582

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

4583

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4584

*

4585

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

4586

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

4587

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

4588

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

4589

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4590

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4591

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4592

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4593

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

4594

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4595

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

4596

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4597

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4598

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4599

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4600

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4601

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4602

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4603

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4604

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4605

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4606

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4607

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

4608

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

4609

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4610

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4611

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4612

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4613

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4614

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4615

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4616

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4617

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4618

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4619

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4620

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

4621

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4622

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4623

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4624

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4625

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4626

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4627

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4628

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4629

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4630

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

4631

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4632

*/

4633

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

4634

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4635

#if defined(BETA)

4636

IMAGE_DECLARATION(src2),

4637

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4638

IMAGE_DECLARATION(dst),

4639

uint src0_stride_z,

4640

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4641

#if defined(BETA)

4642

uint src2_stride_z,

4643

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4644

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4645

#if defined(REINTERPRET_INPUT_AS_3D)

4646

,

4647

uint src_cross_plane_pad

4648

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4649

#if defined(REINTERPRET_OUTPUT_AS_3D)

4650

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4651

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4652

#endif // REINTERPRET_OUTPUT_AS_3D

4653

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4654

{

4655

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

4656

4657

// Compute starting address for matrix A and matrix B

4658

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

4659

4660

// Update address for matrix A

4661

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

4662

4663

// Update address for matrix B

4664

src_addr.s1 += idx * sizeof(float);

4665

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4666

#if defined(REINTERPRET_INPUT_AS_3D)

4667

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

4668

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

4674

// |******************|

4675

// | cross_plane_pad |

4676

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

4681

4682

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

4683

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

4684

zin = min(DEPTH_GEMM3D - 1, zin);

4685

4686

// Add offset due to the cross plane paddings

4687

zin *= (src_cross_plane_pad * src0_stride_y);

4688

4689

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4690

// multiply src0_stride_z by DEPTH_GEMM3D

4691

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

4692

4693

#else // defined(REINTERPRET_INPUT_AS_3D)

4694

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4695

// Add offset for batched GEMM

4696

src_addr.s0 += get_global_id(2) * src0_stride_z;

4697

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4698

#endif // defined(REINTERPRET_INPUT_AS_3D)

4699

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4700

#if defined(MATRIX_B_DEPTH)

4701

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4702

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

4703

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4704

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4705

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4706

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4707

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4708

float4 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4709

4710

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4711

float4 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4712

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4713

4714

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4715

float4 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4716

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4717

4718

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4719

float4 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4720

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4721

4722

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4723

int i = 0;

4724

for(; i <= ((int)COLS_A - 4); i += 4)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4725

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4726

#if defined(REINTERPRET_INPUT_AS_3D)

4727

// Load values from matrix A and matrix B

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

4728

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

4729

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4730

// Load values from matrix A and matrix B

4731

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4732

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4733

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4734

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4735

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4736

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4737

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4738

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4739

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4740

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4741

#endif // defined(REINTERPRET_INPUT_AS_3D)

4742

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4743

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

4744

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4745

4746

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4747

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

4748

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

4749

acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);

4750

acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4751

4752

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4753

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4754

acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);

4755

acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);

4756

acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);

4757

acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4758

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4759

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4760

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4761

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4762

acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);

4763

acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);

4764

acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);

4765

acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4766

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4767

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4768

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4769

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4770

acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);

4771

acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);

4772

acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);

4773

acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4774

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4775

4776

// Load values from matrix A and matrix B

4777

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

4778

src_addr.s1 += src1_stride_y;

4779

4780

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4781

acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);

4782

acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);

4783

acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);

4784

acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4785

4786

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4787

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4788

acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);

4789

acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);

4790

acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);

4791

acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4792

4793

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4794

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4795

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4796

acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);

4797

acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);

4798

acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);

4799

acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4800

4801

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4802

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4803

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4804

acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);

4805

acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);

4806

acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);

4807

acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4808

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4809

4810

// Load values from matrix A and matrix B

4811

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

4812

src_addr.s1 += src1_stride_y;

4813

4814

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4815

acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);

4816

acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);

4817

acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);

4818

acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4819

4820

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4821

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4822

acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);

4823

acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);

4824

acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);

4825

acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4826

4827

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4828

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4829

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4830

acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);

4831

acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);

4832

acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);

4833

acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4834

4835

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4836

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4837

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4838

acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);

4839

acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);

4840

acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);

4841

acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4842

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4843

4844

// Load values from matrix A and matrix B

4845

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

4846

src_addr.s1 += src1_stride_y;

4847

4848

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4849

acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);

4850

acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);

4851

acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);

4852

acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4853

4854

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4855

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4856

acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);

4857

acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);

4858

acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);

4859

acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4860

4861

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4862

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4863

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4864

acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);

4865

acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);

4866

acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);

4867

acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4868

4869

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4870

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4871

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4872

acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);

4873

acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);

4874

acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);

4875

acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4876

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4877

4878

src_addr.s0 += 4 * sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4879

}

4880

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4881

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4882

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4883

#if defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4884

// Load values from matrix A

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4885

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

4886

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4887

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

4888

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4889

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4890

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

4891

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4892

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4893

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

4894

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4895

#else // defined(REINTERPRET_INPUT_AS_3D)

4896

// Load values from matrix A

4897

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4898

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4899

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

4900

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4901

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4902

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

4903

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4904

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4905

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

4906

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4907

#endif // defined(REINTERPRET_INPUT_AS_3D)

4908

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4909

// Load values from matrix B

4910

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4911

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4912

4913

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4914

acc0.s0 = fma(a0, b0.s0, acc0.s0);

4915

acc0.s1 = fma(a0, b0.s1, acc0.s1);

4916

acc0.s2 = fma(a0, b0.s2, acc0.s2);

4917

acc0.s3 = fma(a0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4918

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4919

acc1.s0 = fma(a1, b0.s0, acc1.s0);

4920

acc1.s1 = fma(a1, b0.s1, acc1.s1);

4921

acc1.s2 = fma(a1, b0.s2, acc1.s2);

4922

acc1.s3 = fma(a1, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4923

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4924

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4925

acc2.s0 = fma(a2, b0.s0, acc2.s0);

4926

acc2.s1 = fma(a2, b0.s1, acc2.s1);

4927

acc2.s2 = fma(a2, b0.s2, acc2.s2);

4928

acc2.s3 = fma(a2, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4929

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4930

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4931

acc3.s0 = fma(a3, b0.s0, acc3.s0);

4932

acc3.s1 = fma(a3, b0.s1, acc3.s1);

4933

acc3.s2 = fma(a3, b0.s2, acc3.s2);

4934

acc3.s3 = fma(a3, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4935

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4936

4937

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4938

}

4939

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4940

int z = get_global_id(2);

4941

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4942

// Compute destination address

4943

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4944

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4945

// Compute dst address

4946

__global uchar *dst_addr = offset(&dst, 0, 0);

4947

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4948

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4949

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4950

#if defined(REINTERPRET_OUTPUT_AS_3D)

4951

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4952

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4953

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4958

// |******************|

4959

// | cross_plane_pad |

4960

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4965

4966

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4967

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

4968

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4969

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4970

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4971

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4972

4973

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4974

// multiply dst_stride_z by DEPTH_GEMM3D

4975

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4976

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4977

// Add offset for batched GEMM

4978

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4979

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4980

4981

// Multiply by the weight of matrix-matrix product and store the result

4982

#if defined(ALPHA)

4983

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

4984

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

4989

4990

#if defined(BROADCAST_BIAS)

4991

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4992

4993

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4994

4995

#ifndef UNIT_BETA

4996

SCALE_BLOCK(1, float, bias, BETA);

4997

#endif // UNIT_BETA

4998

4999

// acc = acc + bias[broadcasted]

5000

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

5001

5002

#else // defined(BROADCAST_BIAS)

5003

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *

5004

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

5005

5006

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

5007

5008

#ifndef UNIT_BETA

5009

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

5014

5015

#endif // defined(BROADCAST_BIAS)

5016

#endif // defined(BETA)

5017

5018

#if defined(ACTIVATION_TYPE)

5019

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

5020

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5021

5022

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5023

vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5024

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5025

vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5026

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5027

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5028

vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5029

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5030

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5031

vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5032

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5033

}

5034

5035

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

5036

*

5037

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

5038

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.

5039

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

5040

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

5041

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

5042

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5043

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5044

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5045

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5046

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5047

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5048

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5049

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5050

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5051

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5052

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5053

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5054

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5055

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5056

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5057

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

5058

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5059

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

5060

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5061

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5062

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5063

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

5064

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5065

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

5066

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5067

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5068

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5069

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5070

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5071

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5072

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5073

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

5074

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5075

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5076

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5077

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5078

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5079

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5080

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5081

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5082

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5083

* @param[in] src_cross_plane_pad (Optional) Bottom padding in units of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5084

* @param[in] dst_cross_plane_pad (Optional) Bottom padding in units of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5085

*/

5086

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

5087

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5088

#if defined(BETA)

5089

IMAGE_DECLARATION(src2),

5090

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5091

IMAGE_DECLARATION(dst),

5092

uint src0_stride_z,

5093

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5094

#if defined(BETA)

5095

uint src2_stride_z,

5096

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5097

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5098

#if defined(REINTERPRET_INPUT_AS_3D)

5099

,

5100

uint src_cross_plane_pad

5101

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5102

#if defined(REINTERPRET_OUTPUT_AS_3D)

5103

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5104

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5105

#endif // REINTERPRET_OUTPUT_AS_3D

5106

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5107

{

5108

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5109

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

5110

5111

// Compute starting address for matrix A and Matrix B

5112

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

5113

5114

// Update address for the matrix A

5115

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

5116

5117

// Update address for the matrix B

5118

src_addr.s1 += idx * sizeof(float);

5119

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5120

#if defined(REINTERPRET_INPUT_AS_3D)

5121

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5122

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5128

// |******************|

5129

// | cross_plane_pad |

5130

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5135

5136

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5137

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5138

zin = min(DEPTH_GEMM3D - 1, zin);

5139

5140

// Add offset due to the cross plane paddings

5141

zin *= (src_cross_plane_pad * src0_stride_y);

5142

5143

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5144

// multiply src0_stride_z by DEPTH_GEMM3D

5145

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5146

5147

#else // defined(REINTERPRET_INPUT_AS_3D)

5148

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5149

// Add offset for batched GEMM

5150

src_addr.s0 += get_global_id(2) * src0_stride_z;

5151

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5152

#endif // defined(REINTERPRET_INPUT_AS_3D)

5153

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5154

#if defined(MATRIX_B_DEPTH)

5155

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5156

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

5157

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5158

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5159

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5160

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5161

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5162

float2 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5163

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5164

float2 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5165

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5166

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5167

float2 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5168

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5169

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5170

float2 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5171

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5172

5173

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

5174

int i = 0;

5175

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5176

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5177

#if defined(REINTERPRET_INPUT_AS_3D)

5178

// Load values from matrix A

5179

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));

5180

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5181

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

5182

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5183

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5184

5185

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

5186

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

5187

src_addr.s1 += src1_stride_y;

5188

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

5189

src_addr.s1 += src1_stride_y;

5190

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

5191

src_addr.s1 += src1_stride_y;

5192

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

5193

src_addr.s1 += src1_stride_y;

5194

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

5195

src_addr.s1 += src1_stride_y;

5196

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

5197

src_addr.s1 += src1_stride_y;

5198

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

5199

src_addr.s1 += src1_stride_y;

5200

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

5201

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5202

5203

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5204

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

5205

acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);

5206

acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);

5207

acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);

5208

acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);

5209

acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);

5210

acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);

5211

acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5212

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5213

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

5214

acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);

5215

acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);

5216

acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);

5217

acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);

5218

acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);

5219

acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);

5220

acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5221

5222

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5223

#if defined(REINTERPRET_INPUT_AS_3D)

5224

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

5225

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5226

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5227

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5228

acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);

5229

acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);

5230

acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);

5231

acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);

5232

acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);

5233

acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);

5234

acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);

5235

acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5236

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5237

acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);

5238

acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);

5239

acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);

5240

acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);

5241

acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);

5242

acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);

5243

acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);

5244

acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5245

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5246

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5247

#if defined(REINTERPRET_INPUT_AS_3D)

5248

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

5249

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5250

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5251

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5252

acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);

5253

acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);

5254

acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);

5255

acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);

5256

acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);

5257

acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);

5258

acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);

5259

acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5260

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5261

acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);

5262

acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);

5263

acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);

5264

acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);

5265

acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);

5266

acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);

5267

acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);

5268

acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5269

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5270

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5271

#if defined(REINTERPRET_INPUT_AS_3D)

5272

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

5273

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5274

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5275

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5276

acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);

5277

acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);

5278

acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);

5279

acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);

5280

acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);

5281

acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);

5282

acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);

5283

acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5284

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5285

acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);

5286

acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);

5287

acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);

5288

acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);

5289

acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);

5290

acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);

5291

acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);

5292

acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5293

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

5294

5295

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5296

}

5297

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

5298

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5299

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5300

#if defined(REINTERPRET_INPUT_AS_3D)

5301

// Load values from matrix A

5302

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

5303

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5304

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

5305

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5306

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5307

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

5308

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5309

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5310

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

5311

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5312

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5313

// Load values from matrix A

5314

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5315

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5316

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5317

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5318

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5319

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5320

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5321

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5322

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5323

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5324

#endif // defined(REINTERPRET_INPUT_AS_3D)

5325

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5326

// Load values from matrix B

5327

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

5328

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5329

5330

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5331

acc0.s0 = fma(a0, b0.s0, acc0.s0);

5332

acc0.s1 = fma(a0, b0.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5333

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5334

acc1.s0 = fma(a1, b0.s0, acc1.s0);

5335

acc1.s1 = fma(a1, b0.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5336

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5337

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5338

acc2.s0 = fma(a2, b0.s0, acc2.s0);

5339

acc2.s1 = fma(a2, b0.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5340

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5341

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5342

acc3.s0 = fma(a3, b0.s0, acc3.s0);

5343

acc3.s1 = fma(a3, b0.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5344

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

5345

5346

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5347

}

5348

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5349

int z = get_global_id(2);

5350

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5351

// Compute destination address

5352

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5353

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5354

// Compute dst address

5355

__global uchar *dst_addr = offset(&dst, 0, 0);

5356

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5357

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5358

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5359

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5360

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5361

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5362

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5363

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5368

// |******************|

5369

// | cross_plane_pad |

5370

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5375

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5376

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5377

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5378

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5379

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5380

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5381

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5382

5383

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5384

// multiply dst_stride_z by DEPTH_GEMM3D

5385

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5386

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5387

// Add offset for batched GEMM

5388

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5389

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5390

5391

// Multiply by the weight of matrix-matrix product and store the result

5392

#if defined(ALPHA)

5393

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

5394

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

5399

5400

#if defined(BROADCAST_BIAS)

5401

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));

5402

5403

LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

5404

5405

#ifndef UNIT_BETA

5406

SCALE_BLOCK(1, float, bias, BETA);

5407

#endif // UNIT_BETA

5408

5409

// acc = acc + bias[broadcasted]

5410

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

5411

5412

#else // defined(BROADCAST_BIAS)

5413

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *

5414

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

5415

5416

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

5417

5418

#ifndef UNIT_BETA

5419

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

5424

5425

#endif // defined(BROADCAST_BIAS)

5426

#endif // defined(BETA)

5427

5428

#if defined(ACTIVATION_TYPE)

5429

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

5430

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5431

5432

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5433

vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5434

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5435

vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5436

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5437

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5438

vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5439

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5440

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5441

vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5442

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5443

}

5444

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

5445

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5446

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

5447

*

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5448

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.

5449

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

5450

* This kernel loads, accumulates and stores 8 elements per work-item along x (vload8 / float8), so -DNUM_ELEMS_PROCESSED_PER_THREAD_X=8 is expected.

5451

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

5452

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5453

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5454

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5455

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5456

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5457

* The activation function is performed after the bias addition

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5458

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5459

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

5460

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5461

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5462

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5463

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5464

*

5465

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5466

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5467

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5468

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5469

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5470

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5471

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5472

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5473

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5474

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5475

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5476

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5477

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5478

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5479

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5480

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5481

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5482

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5483

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

5484

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5485

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

5486

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5487

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

5488

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

5489

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5490

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5491

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5492

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

5493

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5494

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

5495

*/

5496

__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),

5497

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5498

#if defined(BETA)

5499

IMAGE_DECLARATION(src2),

5500

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5501

IMAGE_DECLARATION(dst),

5502

uint src0_stride_z,

5503

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5504

#if defined(BETA)

5505

uint src2_stride_z,

5506

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5507

uint dst_stride_z

5508

#if defined(REINTERPRET_INPUT_AS_3D)

5509

,

5510

uint src_cross_plane_pad

5511

#endif // REINTERPRET_INPUT_AS_3D

5512

#if defined(REINTERPRET_OUTPUT_AS_3D)

5513

,

5514

uint dst_cross_plane_pad

5515

#endif // REINTERPRET_OUTPUT_AS_3D

5516

)

5517

{

5518

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

5519

5520

// Compute starting address for matrix A and Matrix B

5521

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

5522

5523

// Update address for the matrix A

5524

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

5525

5526

// Update address for the matrix B

5527

src_addr.s1 += idx * sizeof(half);

5528

5529

#if defined(REINTERPRET_INPUT_AS_3D)

5530

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5531

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5537

// |******************|

5538

// | cross_plane_pad |

5539

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5544

5545

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5546

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5547

zin = min(DEPTH_GEMM3D - 1, zin);

5548

5549

// Add offset due to the cross plane paddings

5550

zin *= (src_cross_plane_pad * src0_stride_y);

5551

5552

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5553

// multiply src0_stride_z by DEPTH_GEMM3D

5554

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5555

5556

#else // defined(REINTERPRET_INPUT_AS_3D)

5557

5558

// Add offset for batched GEMM

5559

src_addr.s0 += get_global_id(2) * src0_stride_z;

5560

5561

#endif // defined(REINTERPRET_INPUT_AS_3D)

5562

5563

#if defined(MATRIX_B_DEPTH)

5564

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5565

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

5566

#else // defined(MATRIX_B_DEPTH)

5567

src_addr.s1 += get_global_id(2) * src1_stride_z;

5568

#endif // defined(MATRIX_B_DEPTH)

5569

5570

float8 acc0 = 0.0h;

5571

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5572

float8 acc1 = 0.0h;

5573

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5574

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5575

float8 acc2 = 0.0h;

5576

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5577

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5578

float8 acc3 = 0.0h;

5579

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5580

5581

int i = 0;

5582

for(; i <= ((int)COLS_A - 4); i += 4)

5583

{

5584

#if defined(REINTERPRET_INPUT_AS_3D)

5585

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5586

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

5587

#else // defined(REINTERPRET_INPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5588

// Load values from matrix A

5589

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5590

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5591

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5592

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5593

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5594

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5595

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5596

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5597

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5598

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5599

#endif // defined(REINTERPRET_INPUT_AS_3D)

5600

5601

// Load values from matrix B

5602

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

5603

src_addr.s1 += src1_stride_y;

5604

5605

// Accumulate

5606

acc0 = fma(b0, (float8)a0.s0, acc0);

5607

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5608

acc1 = fma(b0, (float8)a1.s0, acc1);

5609

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5610

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5611

acc2 = fma(b0, (float8)a2.s0, acc2);

5612

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5613

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5614

acc3 = fma(b0, (float8)a3.s0, acc3);

5615

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5616

5617

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

5618

src_addr.s1 += src1_stride_y;

5619

acc0 = fma(b0, (float8)a0.s1, acc0);

5620

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5621

acc1 = fma(b0, (float8)a1.s1, acc1);

5622

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5623

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5624

acc2 = fma(b0, (float8)a2.s1, acc2);

5625

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5626

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5627

acc3 = fma(b0, (float8)a3.s1, acc3);

5628

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5629

5630

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

5631

src_addr.s1 += src1_stride_y;

5632

acc0 = fma(b0, (float8)a0.s2, acc0);

5633

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5634

acc1 = fma(b0, (float8)a1.s2, acc1);

5635

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5636

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5637

acc2 = fma(b0, (float8)a2.s2, acc2);

5638

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5639

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5640

acc3 = fma(b0, (float8)a3.s2, acc3);

5641

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5642

5643

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

5644

src_addr.s1 += src1_stride_y;

5645

acc0 = fma(b0, (float8)a0.s3, acc0);

5646

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5647

acc1 = fma(b0, (float8)a1.s3, acc1);

5648

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5649

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5650

acc2 = fma(b0, (float8)a2.s3, acc2);

5651

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5652

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5653

acc3 = fma(b0, (float8)a3.s3, acc3);

5654

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5655

5656

src_addr.s0 += 4 * sizeof(half);

5657

}

5658

5659

for(; i < (int)COLS_A; ++i)

5660

{

5661

#if defined(REINTERPRET_INPUT_AS_3D)

5662

// Load values from matrix A

5663

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

5664

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5665

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

5666

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5667

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5668

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

5669

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5670

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5671

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

5672

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5673

#else // defined(REINTERPRET_INPUT_AS_3D)

5674

// Load values from matrix A

5675

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5676

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5677

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5678

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5679

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5680

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5681

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5682

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5683

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5684

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5685

#endif // defined(REINTERPRET_INPUT_AS_3D)

5686

5687

// Load values from matrix B

5688

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

5689

5690

src_addr += (int2)(sizeof(half), src1_stride_y);

5691

5692

// Accumulate

5693

acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;

5694

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5695

acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;

5696

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5697

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5698

acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;

5699

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5700

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5701

acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;

5702

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5703

}

5704

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5705

int z = get_global_id(2);

5706

5707

// Compute destination address

5708

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5709

5710

// Compute dst address

5711

__global uchar *dst_addr = offset(&dst, 0, 0);

5712

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5713

uint4 zout = 0;

5714

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5715

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5716

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5717

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

5718

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5724

// |******************|

5725

// | cross_plane_pad |

5726

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5731

5732

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5733

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5734

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5735

5736

// Add offset due to the cross plane paddings

5737

zout *= (dst_cross_plane_pad * dst_stride_y);

5738

5739

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5740

// multiply dst_stride_z by DEPTH_GEMM3D

5741

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5742

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5743

// Add offset for batched GEMM

5744

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5745

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5746

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5747

// Multiply by the weight of matrix-matrix product and store the result

5748

#if defined(ALPHA)

5749

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

5750

#endif // defined(ALPHA)

5751

5752

#if defined(BETA)

5753

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

5754

5755

#if defined(BROADCAST_BIAS)

5756

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5757

5758

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5759

5760

float8 bias_f0 = convert_float8(bias0);

5761

5762

#ifndef UNIT_BETA

5763

SCALE_BLOCK(1, float, bias_f, BETA);

5764

#endif // UNIT_BETA

5765

5766

// acc = acc + bias[broadcasted]

5767

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);

5768

5769

#else // defined(BROADCAST_BIAS)

5770

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

5771

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

5772

5773

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5774

5775

float8 bias_f0 = convert_float8(bias0);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5776

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5777

float8 bias_f1 = convert_float8(bias1);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5778

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5779

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5780

float8 bias_f2 = convert_float8(bias2);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5781

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5782

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5783

float8 bias_f3 = convert_float8(bias3);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5784

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5785

5786

#ifndef UNIT_BETA

5787

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);

5792

5793

#endif // defined(BROADCAST_BIAS)

5794

#endif // defined(BETA)

5795

5796

half8 acc_h0 = convert_half8(acc0);

5797

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5798

half8 acc_h1 = convert_half8(acc1);

5799

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5800

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5801

half8 acc_h2 = convert_half8(acc2);

5802

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5803

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5804

half8 acc_h3 = convert_half8(acc3);

5805

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5806

5807

#if defined(ACTIVATION_TYPE)

5808

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);

5809

#endif // defined(ACTIVATION_TYPE)

5810

5811

// Store the output block

5812

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5813

}

5814

5815

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

5816

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5817

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.

5818

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

5819

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

5820

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

5821

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5822

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5823

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5824

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5825

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

5826

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5827

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5828

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5829

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5830

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5831

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5832

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5833

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5834

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5835

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5836

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5837

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5838

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5839

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5840

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5841

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5842

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5843

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5844

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5845

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5846

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5847

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5848

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5849

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5850

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5851

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5852

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

5853

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5854

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

5855

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5856

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

5857

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5858

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5859

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5860

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5861

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5862

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5863

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5864

*/

5865

__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),

5866

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5867

#if defined(BETA)

5868

IMAGE_DECLARATION(src2),

5869

#endif // defined(BETA)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5870

IMAGE_DECLARATION(dst),

5871

uint src0_stride_z,

5872

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5873

#if defined(BETA)

5874

uint src2_stride_z,

5875

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5876

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5877

#if defined(REINTERPRET_INPUT_AS_3D)

5878

,

5879

uint src_cross_plane_pad

5880

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5881

#if defined(REINTERPRET_OUTPUT_AS_3D)

5882

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5883

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5884

#endif // REINTERPRET_OUTPUT_AS_3D

5885

)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5886

{

5887

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

5888

5889

// Compute starting address for matrix A and Matrix B

5890

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

5891

5892

// Update address for the matrix A

5893

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

5894

5895

// Update address for the matrix B

5896

src_addr.s1 += idx * sizeof(half);

5897

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5898

#if defined(REINTERPRET_INPUT_AS_3D)

5899

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5900

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5906

// |******************|

5907

// | cross_plane_pad |

5908

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5913

5914

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5915

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5916

zin = min(DEPTH_GEMM3D - 1, zin);

5917

5918

// Add offset due to the cross plane paddings

5919

zin *= (src_cross_plane_pad * src0_stride_y);

5920

5921

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5922

// multiply src0_stride_z by DEPTH_GEMM3D

5923

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5924

5925

#else // defined(REINTERPRET_INPUT_AS_3D)

5926

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5927

// Add offset for batched GEMM

5928

src_addr.s0 += get_global_id(2) * src0_stride_z;

5929

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5930

#endif // defined(REINTERPRET_INPUT_AS_3D)

5931

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5932

#if defined(MATRIX_B_DEPTH)

5933

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5934

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

5935

#else // defined(MATRIX_B_DEPTH)

5936

src_addr.s1 += get_global_id(2) * src1_stride_z;

5937

#endif // defined(MATRIX_B_DEPTH)

5938

5939

half8 acc0 = 0.0h;

5940

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5941

half8 acc1 = 0.0h;

5942

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5943

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5944

half8 acc2 = 0.0h;

5945

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5946

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5947

half8 acc3 = 0.0h;

5948

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5949

5950

int i = 0;

5951

for(; i <= ((int)COLS_A - 4); i += 4)

5952

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5953

#if defined(REINTERPRET_INPUT_AS_3D)

5954

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5955

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

5956

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5957

// Load values from matrix A

5958

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5959

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5960

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5961

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5962

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5963

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5964

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5965

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5966

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5967

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5968

#endif // defined(REINTERPRET_INPUT_AS_3D)

5969

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5970

// Load values from matrix B

5971

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

5972

src_addr.s1 += src1_stride_y;

5973

5974

// Accumulate

5975

acc0 = fma(b0, (half8)a0.s0, acc0);

5976

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5977

acc1 = fma(b0, (half8)a1.s0, acc1);

5978

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5979

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5980

acc2 = fma(b0, (half8)a2.s0, acc2);

5981

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5982

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5983

acc3 = fma(b0, (half8)a3.s0, acc3);

5984

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5985

5986

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

5987

src_addr.s1 += src1_stride_y;

5988

acc0 = fma(b0, (half8)a0.s1, acc0);

5989

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5990

acc1 = fma(b0, (half8)a1.s1, acc1);

5991

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5992

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5993

acc2 = fma(b0, (half8)a2.s1, acc2);

5994

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5995

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5996

acc3 = fma(b0, (half8)a3.s1, acc3);

5997

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5998

5999

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

6000

src_addr.s1 += src1_stride_y;

6001

acc0 = fma(b0, (half8)a0.s2, acc0);

6002

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6003

acc1 = fma(b0, (half8)a1.s2, acc1);

6004

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6005

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6006

acc2 = fma(b0, (half8)a2.s2, acc2);

6007

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6008

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6009

acc3 = fma(b0, (half8)a3.s2, acc3);

6010

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6011

6012

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

6013

src_addr.s1 += src1_stride_y;

6014

acc0 = fma(b0, (half8)a0.s3, acc0);

6015

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6016

acc1 = fma(b0, (half8)a1.s3, acc1);

6017

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6018

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6019

acc2 = fma(b0, (half8)a2.s3, acc2);

6020

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6021

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6022

acc3 = fma(b0, (half8)a3.s3, acc3);

6023

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6024

6025

src_addr.s0 += 4 * sizeof(half);

6026

}

6027

6028

for(; i < (int)COLS_A; ++i)

6029

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6030

#if defined(REINTERPRET_INPUT_AS_3D)

6031

// Load values from matrix A

6032

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6033

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6034

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6035

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6036

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6037

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6038

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6039

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6040

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6041

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6042

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6043

// Load values from matrix A

6044

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6045

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6046

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6047

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6048

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6049

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6050

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6051

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6052

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6053

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6054

#endif // defined(REINTERPRET_INPUT_AS_3D)

6055

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6056

// Load values from matrix B

6057

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

6058

6059

src_addr += (int2)(sizeof(half), src1_stride_y);

6060

6061

// Accumulate

6062

acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;

6063

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6064

acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;

6065

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6066

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6067

acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;

6068

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6069

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6070

acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;

6071

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6072

}

6073

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6074

int z = get_global_id(2);

6075

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6076

// Compute destination address

6077

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6078

6079

// Compute dst address

6080

__global uchar *dst_addr = offset(&dst, 0, 0);

6081

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6082

uint4 zout = 0;

6083

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6084

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6085

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6086

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6087

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6088

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

6093

// |******************|

6094

// | cross_plane_pad |

6095

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6100

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6101

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6102

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6103

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6104

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6105

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6106

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6107

6108

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6109

// multiply dst_stride_z by DEPTH_GEMM3D

6110

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6111

#else // defined(REINTERPRET_OUTPUT_AS_3D)

6112

// Add offset for batched GEMM

6113

dst_addr += z * dst_stride_z;

6114

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6115

6116

// Multiply by the weight of matrix-matrix product and store the result

6117

#if defined(ALPHA)

6118

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);

6119

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6124

6125

#if defined(BROADCAST_BIAS)

6126

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

6127

6128

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

6129

6130

#ifndef UNIT_BETA

6131

SCALE_BLOCK(1, half, bias, BETA);

6132

#endif // UNIT_BETA

6133

6134

// acc = acc + bias[broadcasted]

6135

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6136

6137

#else // defined(BROADCAST_BIAS)

6138

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

6139

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6140

6141

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

6142

6143

#ifndef UNIT_BETA

6144

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6149

6150

#endif // defined(BROADCAST_BIAS)

6151

#endif // defined(BETA)

6152

6153

#if defined(ACTIVATION_TYPE)

6154

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);

6155

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6156

6157

// Store the output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6158

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6159

}

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

6160

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6161

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

6162

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6163

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6164

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6165

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

6166

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

6167

* @note The beta's value need to be passed at compile time using -DBETA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6168

*

6169

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

6170

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

6171

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

6172

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

6173

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6174

* @param[in] src_stride_z Stride of the destination tensor in Z dimension (in bytes)

6175

* @param[in] src_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6176

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

6177

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6178

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6179

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

6180

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6181

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6182

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

6183

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6184

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

6185

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6186

__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),

6187

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6188

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6189

// Compute source and destination addresses

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6190

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

6191

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6192

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6193

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6194

float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

6195

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6196

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6197

float4 c = vload4(0, (__global float *)src.ptr);

6198

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6199

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6200

float4 out = alpha_ab + (float4)BETA * c;

6201

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6202

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6203

vstore4(out, 0, (__global float *)dst.ptr);

6204

}

6205

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

6206

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6207

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

6208

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

6209

* @note The beta's value need to be passed at compile time using -DBETA

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

6210

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6211

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

6212

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

6213

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

6214

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

6215

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6216

* @param[in] src_stride_z Stride of the destination tensor in Z dimension (in bytes)

6217

* @param[in] src_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6218

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

6219

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6220

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6221

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

6222

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6223

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6224

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

6225

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6226

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

6227

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6228

__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),

6229

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6230

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6231

// Compute source and destination addresses

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6232

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

6233

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6234

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6235

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6236

half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

6237

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6238

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6239

half8 c = vload8(0, (__global half *)src.ptr);

6240

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6241

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6242

half8 out = alpha_ab + (half8)BETA * c;

6243

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6244

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6245

vstore8(out, 0, (__global half *)dst.ptr);

6246

}

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

6247

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6248

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6249

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6250

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6251

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

6252

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

6253

* @note The width of A need to be passed at compile time using -DWIDTH_VECTOR_A

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6254

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

6255

* @note The input A and matrix B must not be reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6256

*

6257

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

6258

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6259

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

6260

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6261

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

6262

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

6263

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6264

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6265

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

6266

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6267

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

6268

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

6269

* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

6270

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

6271

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6272

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6273

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

6274

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6275

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

6276

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

6277

*/

6278

__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),

6279

TENSOR3D_DECLARATION(src1),

6280

IMAGE_DECLARATION(dst))

6281

{

6282

int idx = get_global_id(0) * 4;

6283

int idy = get_global_id(1);

6284

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6285

// Compute the address for the vector A and matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6286

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));

6287

src_addr.s1 += idx * sizeof(float);

6288

6289

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

float4 acc = 0.0f;

Georgios Pinitas

2017-10-20 18:52:20 +0100

[diff] [blame]

6293

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6294

{

6295

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));

6296

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6297

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

6298

6299

acc += b0 * (float4)a0.s0;

6300

acc += b1 * (float4)a0.s1;

6301

}

6302

6303

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

6304

{

6305

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

6306

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6307

6308

acc += b0 * (float4)a0;

6309

}

6310

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6311

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

6312

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6313

6314

vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));

6315

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6316

#endif // defined(WIDTH_VECTOR_A)

6317

6318

/** This kernel accumulates each row with the biases vector.

6319

*

6320

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.

6321

* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.

6322

*

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

6323

* @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/U16/S16/F16/U32/S32/F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6324

* @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)

6325

* @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)

6326

* @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)

6327

* @param[in] accum_step_y accum_stride_y * number of elements along Y processed per workitem(in bytes)

6328

* @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor

6329

* @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr

6330

* @param[in] biases_stride_x Stride of the biases vector in X dimension (in bytes)

6331

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

6332

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases vector

6333

*/

6334

#if defined(DATA_TYPE) && defined(VECTOR_SIZE)

6335

__kernel void gemm_accumulate_biases(

6336

IMAGE_DECLARATION(accum),

6337

VECTOR_DECLARATION(biases))

6338

{

6339

Image accum = CONVERT_TO_IMAGE_STRUCT(accum);

6340

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

6341

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6342

// Vector size, i.e. number of vector elements.

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6343

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

6344

accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);

6345

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

6346

biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

6347

accum_value = biases_value + accum_value;

Anton Lokhmotov