Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2019-07-19 09:54:47 +0100

[diff] [blame]

49

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

50

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

51

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

52

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

53

* @note Only the following values for M0, K0 and V0 are supported:

54

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

55

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

56

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

57

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

58

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

59

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

60

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

61

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

62

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

63

*

64

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

65

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

66

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

67

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

68

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

69

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

70

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

71

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

72

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr

73

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

74

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

75

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

76

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

77

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

78

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

79

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

80

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

81

*/

82

__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                         ,
                                         uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                        )
{
    // Number of elements in one M0xK0 block
#define BLOCK_SIZE ((M0) * (K0))

    // Output offset X: distance (in elements) between the start of two consecutive
    // blocks that share the same output row
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif // defined(INTERLEAVE)

    // Output step X: distance (in elements) between two consecutive rows of the same block
    // in the output. The expansion is fully parenthesized so that the macro can be used
    // safely inside any larger expression.
#if defined(INTERLEAVE)
#define OUTPUT_STEP_X ((K0) * (V0))
#else // Do not interleave
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: x advances by K0 elements along X, y advances by M0 rows along Y
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 consecutive values of y share the output row (y / V0);
    // (y % V0) selects the position of the block inside that group
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % (uint)V0) *
                                 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Load values from the LHS matrix into the rows a0..a(M0-1)
    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

    // Apply the X boundary condition to every loaded row
    // NOTE(review): BOUNDARY_CONDITION_X is defined outside this section - confirm its exact effect there
    BOUNDARY_CONDITION_X(x, a0);
#if M0 > 1
    BOUNDARY_CONDITION_X(x, a1);
#endif // M0 > 1
#if M0 > 2
    BOUNDARY_CONDITION_X(x, a2);
#endif // M0 > 2
#if M0 > 3
    BOUNDARY_CONDITION_X(x, a3);
#endif // M0 > 3
#if M0 > 4
    BOUNDARY_CONDITION_X(x, a4);
#endif // M0 > 4
#if M0 > 5
    BOUNDARY_CONDITION_X(x, a5);
#endif // M0 > 5
#if M0 > 6
    BOUNDARY_CONDITION_X(x, a6);
#endif // M0 > 6
#if M0 > 7
    BOUNDARY_CONDITION_X(x, a7);
#endif // M0 > 7

    // ---------------------------Store output values ------------------------------

    // Create variables: uint zout0=0, zout1=0, ... (no additional offset is passed for the output rows)
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

175

176

// TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)
//
// Gathers component `i` of each row vector a0..a(M0-1) into a single column and stores it at
// output_ptr + i * output_step_x * sizeof(DATA_TYPE) (output_ptr is a byte pointer).
//
// `i` must be a single hexadecimal digit (0-9, A-F): it is pasted both into the vector
// component selector (.s##i) and into an integer literal (0x##i).
//
// The rows a0..a(M0-1) are taken from the scope in which the macro is expanded.
// Both pointer and step arguments are parenthesized inside the expansion so that
// expressions (e.g. a product of two macros) can be passed safely.
#if M0 == 2
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 3 // M0 == 3
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 4 // M0 == 4
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
// For M0 = 5, 6 and 7 the column is written in two parts: the first 4 values as a 4-wide
// vector, the remaining 1, 2 or 3 values right after it (element offset 4).
#elif M0 == 5 // M0 == 5
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        DATA_TYPE res1 = a4.s##i; \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        *((__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4) = res1; \
    })
#elif M0 == 6 // M0 == 6
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VEC_DATA_TYPE(DATA_TYPE, 2) \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        VSTORE(2) \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 7 // M0 == 7
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, 4) \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
        VEC_DATA_TYPE(DATA_TYPE, 3) \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
        VSTORE(4) \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
        VSTORE(3) \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 8 // M0 == 8
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
    ({ \
        VEC_DATA_TYPE(DATA_TYPE, M0) \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
        VSTORE(M0) \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#else // M0 not supported
#error "M0 value not supported"
#endif // M0 conditions

245

246

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in

247

* the output matrix unrolling the values.

248

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

249

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

250

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

251

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

252

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

253

* @note Only the following values for M0, K0 and V0 are supported:

254

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

255

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

256

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

257

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

258

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

259

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

260

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

261

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

262

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

263

*

264

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

265

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

266

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

267

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

268

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

269

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

270

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

271

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

272

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr

273

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

274

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

275

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

276

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

277

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

278

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

279

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

280

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

281

*/

282

__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
                                        TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                        ,
                                        uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                       )
{
    // Number of elements in one M0xK0 block
#define BLOCK_SIZE ((M0) * (K0))

    // Output offset X: distance (in elements) between the start of two consecutive
    // blocks that share the same output row
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (M0)
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif // defined(INTERLEAVE)

    // Output step X: distance (in elements) between two consecutive transposed columns of
    // the same block in the output. The expansion is fully parenthesized so that the macro
    // can be used safely inside any larger expression.
#if defined(INTERLEAVE)
#define OUTPUT_STEP_X ((M0) * (V0))
#else // Do not interleave
#define OUTPUT_STEP_X (M0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: x advances by K0 elements along X, y advances by M0 rows along Y
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 consecutive values of y share the output row (y / V0);
    // (y % V0) selects the position of the block inside that group
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % (uint)V0) *
                                 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Load values from the LHS matrix into the rows a0..a(M0-1)
    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

    // Apply the X boundary condition to every loaded row
    // NOTE(review): BOUNDARY_CONDITION_X is defined outside this section - confirm its exact effect there
    BOUNDARY_CONDITION_X(x, a0);
#if M0 > 1
    BOUNDARY_CONDITION_X(x, a1);
#endif // M0 > 1
#if M0 > 2
    BOUNDARY_CONDITION_X(x, a2);
#endif // M0 > 2
#if M0 > 3
    BOUNDARY_CONDITION_X(x, a3);
#endif // M0 > 3
#if M0 > 4
    BOUNDARY_CONDITION_X(x, a4);
#endif // M0 > 4
#if M0 > 5
    BOUNDARY_CONDITION_X(x, a5);
#endif // M0 > 5
#if M0 > 6
    BOUNDARY_CONDITION_X(x, a6);
#endif // M0 > 6
#if M0 > 7
    BOUNDARY_CONDITION_X(x, a7);
#endif // M0 > 7

    // ---------------------------Transpose and store block -----------------------

    // One call per column of the block (K0 columns in total). The last argument is a
    // hexadecimal digit because it is token-pasted inside TRANSPOSE_COLUMN_AND_STORE.
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
#if K0 > 2
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
#endif // K0 > 2
#if K0 > 3
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
#endif // K0 > 3
#if K0 > 4
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
#endif // K0 > 4
#if K0 > 8
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
#endif // K0 > 8

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

399

#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

400

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

401

#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

402

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in

403

* the output matrix unrolling the values.

404

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

405

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

406

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

407

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

408

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

409

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

410

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

411

* N0: 2,3,4,8,16

412

* K0: 1,2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

413

* H0: greater than 0

414

*

415

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

416

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

417

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

418

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

419

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

420

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

421

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

422

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

423

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr

424

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

425

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

426

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

427

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

428

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

429

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

430

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

431

*/

432

__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst))
{
    // Number of elements in one K0xN0 block
#define BLOCK_SIZE ((K0) * (N0))

    // Output offset X: distance (in elements) between the start of two consecutive
    // blocks that share the same output row
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (N0)
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#endif // defined(INTERLEAVE)

    // Output step X: distance (in elements) between two consecutive rows of the same block
    // in the output. The expansion is fully parenthesized so that the macro can be used
    // safely inside any larger expression.
#if defined(INTERLEAVE)
#define OUTPUT_STEP_X ((N0) * (H0))
#else // Do not interleave
#define OUTPUT_STEP_X (N0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: x advances by N0 elements along X, y advances by K0 rows along Y,
    // z selects the batch
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

    // Compute the output address: H0 consecutive values of x share the output row (x / H0);
    // (x % H0) selects the position of the block inside that group
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
                                     x / (uint)H0)
                                 * (uint)dst_stride_y)
                                 + z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Create variables: VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0);

    // Load values from the RHS matrix.
    // Row 0 is always loaded. Every following row is loaded only if it lies inside the
    // input (y * K0 + row < SRC_HEIGHT); otherwise it keeps the value it was initialised with above.
    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
#if K0 > 1
    if(y * (uint)K0 + 1 < SRC_HEIGHT)
    {
        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
    }
#endif // K0 > 1
#if K0 > 2
    if(y * (uint)K0 + 2 < SRC_HEIGHT)
    {
        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
    }
#endif // K0 > 2
#if K0 > 3
    if(y * (uint)K0 + 3 < SRC_HEIGHT)
    {
        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
    }
#endif // K0 > 3
#if K0 > 4
    if(y * (uint)K0 + 4 < SRC_HEIGHT)
    {
        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
    }
    if(y * (uint)K0 + 5 < SRC_HEIGHT)
    {
        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
    }
    if(y * (uint)K0 + 6 < SRC_HEIGHT)
    {
        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
    }
    if(y * (uint)K0 + 7 < SRC_HEIGHT)
    {
        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
    }
#endif // K0 > 4
#if K0 > 8
    if(y * (uint)K0 + 8 < SRC_HEIGHT)
    {
        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
    }
    if(y * (uint)K0 + 9 < SRC_HEIGHT)
    {
        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
    }
    // Rows 10..15 use the hexadecimal suffixes A..F in their variable names
    if(y * (uint)K0 + 10 < SRC_HEIGHT)
    {
        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
    }
    if(y * (uint)K0 + 11 < SRC_HEIGHT)
    {
        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
    }
    if(y * (uint)K0 + 12 < SRC_HEIGHT)
    {
        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
    }
    if(y * (uint)K0 + 13 < SRC_HEIGHT)
    {
        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
    }
    if(y * (uint)K0 + 14 < SRC_HEIGHT)
    {
        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
    }
    if(y * (uint)K0 + 15 < SRC_HEIGHT)
    {
        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
    }
#endif // K0 > 8

    // ---------------------------Store output values ------------------------------

    // Create variables: uint zout0=0, zout1=0, ... (no additional offset is passed for the output rows)
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

#if defined(TRANSPOSE)

555

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in

556

* the output matrix unrolling the values.

557

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

558

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

559

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

560

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

561

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

562

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

563

* @note The option -DTRANSPOSE must be passed at compile time.

564

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

565

* N0: 2,3,4,8,16

566

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

567

* H0: greater than 0

568

*

569

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

570

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

571

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

572

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

573

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

574

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

575

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

576

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

577

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr

578

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

579

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

580

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

581

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

582

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

583

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

584

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

585

*/

586

__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),

587

TENSOR3D_DECLARATION(dst))

588

{

589

// Block size

590

#define BLOCK_SIZE ((K0) * (N0))

591

592

// Output offset X

593

#if defined(INTERLEAVE)

594

#define OUTPUT_OFFSET_X (K0)

595

#else // defined(INTERLEAVE)

596

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

597

#endif // defined(INTERLEAVE)

598

599

// Output step X

600

#if defined(INTERLEAVE)

601

#define OUTPUT_STEP_X (K0) * (H0)

602

#else // Do not interleave

603

#define OUTPUT_STEP_X (K0)

604

#endif // defined(INTERLEAVE)

605

606

// Compute source and destination addresses

607

uint x = get_global_id(0);

608

uint y = get_global_id(1);

609

uint z = get_global_id(2);

610

611

// ------------------ Compute input/output addresses ---------------------------

612

613

// Compute the input address

614

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

615

616

// Compute the output address

617

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /

618

(uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

619

620

// ---------------------------Load input values --------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

621

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

622

623

// Load values from the RHS matrix

624

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

625

if(y * (uint)K0 + 1 < SRC_HEIGHT)

626

{

627

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

628

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

629

#if K0 > 2

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

630

if(y * (uint)K0 + 2 < SRC_HEIGHT)

631

{

632

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

633

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

634

#endif // K0 > 2

635

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

636

if(y * (uint)K0 + 3 < SRC_HEIGHT)

637

{

638

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

639

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

640

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

641

#if K0 > 4

642

if(y * (uint)K0 + 4 < SRC_HEIGHT)

643

{

644

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

645

}

646

if(y * (uint)K0 + 5 < SRC_HEIGHT)

647

{

648

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

649

}

650

if(y * (uint)K0 + 6 < SRC_HEIGHT)

651

{

652

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

653

}

654

if(y * (uint)K0 + 7 < SRC_HEIGHT)

655

{

656

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

657

}

658

#endif // K0 > 4

659

#if K0 > 8

Gian Marco Iodice

8912434

2018-12-19 14:17:22 +0000

[diff] [blame]

660

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

661

{

662

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

663

}

664

if(y * (uint)K0 + 9 < SRC_HEIGHT)

665

{

666

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

667

}

668

if(y * (uint)K0 + 10 < SRC_HEIGHT)

669

{

670

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

671

}

672

if(y * (uint)K0 + 11 < SRC_HEIGHT)

673

{

674

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

675

}

676

if(y * (uint)K0 + 12 < SRC_HEIGHT)

677

{

678

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

679

}

680

if(y * (uint)K0 + 13 < SRC_HEIGHT)

681

{

682

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

683

}

684

if(y * (uint)K0 + 14 < SRC_HEIGHT)

685

{

686

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

687

}

688

if(y * (uint)K0 + 15 < SRC_HEIGHT)

689

{

690

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Transpose the block ------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

695

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

696

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

697

#if K0 == 2

698

// This part computes the following transpositions:

// 2x2 -> 2x2

// 2x4 -> 4x2

// 2x8 -> 8x2

// 2x16 -> 16x2

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);

704

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);

705

#if N0 > 2

706

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);

707

#endif // N0 > 2

708

#if N0 > 3

709

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);

710

#endif // N0 > 3

711

#if N0 > 4

712

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);

713

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);

714

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);

715

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);

716

#endif // N0 > 4

717

#if N0 > 8

718

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);

719

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);

720

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);

721

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);

722

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);

723

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);

724

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);

725

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);

726

#endif // N0 > 8

727

728

#elif K0 == 3 // K0 == 2

729

// This part computes the following transpositions:

// 3x2 -> 2x3

// 3x4 -> 4x3

// 3x8 -> 8x3

// 3x16 -> 16x3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

734

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);

735

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

736

#if N0 > 2

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

737

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

738

#endif // N0 > 2

739

#if N0 > 3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

740

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

741

#endif // N0 > 3

742

#if N0 > 4

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

743

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);

744

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);

745

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);

746

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

747

#endif // N0 > 4

748

#if N0 > 8

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

749

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);

750

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);

751

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);

752

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);

753

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);

754

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);

755

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);

756

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

757

#endif // N0 > 8

758

759

#elif K0 == 4 // K0 == 4

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

760

// This part computes the following transpositions:

// 4x2 -> 2x4

// 4x4 -> 4x4

// 4x8 -> 8x4

// 4x16 -> 16x4

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);

766

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);

767

#if N0 > 2

768

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

769

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

770

#if N0 > 3

771

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);

772

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

773

#if N0 > 4

774

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);

775

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);

776

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);

777

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);

778

#endif // N0 > 4

779

#if N0 > 8

780

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);

781

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);

782

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);

783

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);

784

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);

785

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);

786

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);

787

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);

788

#endif // N0 > 8

789

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

790

#elif K0 == 8 // K0 == 8

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

791

// This part computes the following transpositions:

// 8x2 -> 2x8

// 8x4 -> 4x8

// 8x8 -> 8x8

// 8x16 -> 16x8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

796

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);

797

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

798

#if N0 > 2

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

799

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

800

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

801

#if N0 > 3

802

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);

803

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

804

#if N0 > 4

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

805

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);

806

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);

807

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);

808

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

809

#endif // N0 > 4

810

#if N0 > 8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

811

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);

812

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);

813

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);

814

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);

815

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);

816

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);

817

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);

818

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

819

#endif // N0 > 8

820

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

821

#elif K0 == 16 // K0 == 16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

822

823

// This part computes the following transpositions:

// 16x2 -> 2x16

// 16x4 -> 4x16

// 16x8 -> 8x16

// 16x16 -> 16x16

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,

829

a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);

830

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,

831

a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);

832

#if N0 > 2

833

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,

834

a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

835

#endif // N0 > 2

836

#if N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

837

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,

838

a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

839

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

840

#if N0 > 4

841

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,

842

a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);

843

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,

844

a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);

845

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,

846

a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);

847

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,

848

a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);

849

#endif // N0 > 4

850

#if N0 > 8

851

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,

852

a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);

853

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,

854

a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);

855

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,

856

a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);

857

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,

858

a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);

859

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,

860

a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);

861

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,

862

a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);

863

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,

864

a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);

865

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,

866

a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);

#endif // N0 > 8

#else // N0 == 16

#error "Not supported N0 value"

871

#endif // N0 > 2

872

873

// ---------------------------Store the output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

874

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

875

STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

876

877

#undef BLOCK_SIZE

878

#undef OUTPUT_OFFSET_X

879

#undef OUTPUT_STEP_X

880

}

881

#endif // defined(TRANSPOSE)

882

#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

883

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

884

#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

885

886

/* Pastes two tokens together; used to pick the ARM_DOT<k0> macro matching a given k0. */
#define CONCAT(a, b) a##b

/* ARM_DOT<n>(a, b, c): accumulates into the scalar c the products of the first n components of a and b,
 * one fma() per component, in ascending component order.
 * ARM_DOT1 takes scalar a and b. ARM_DOT8 and ARM_DOT16 are built from the .lo/.hi halves of their operands.
 */
#define ARM_DOT1(a, b, c)       \
    ({                          \
        c = fma(a, b, c);       \
    })
#define ARM_DOT2(a, b, c)               \
    ({                                  \
        c = fma((a).s0, (b).s0, c);     \
        c = fma((a).s1, (b).s1, c);     \
    })
#define ARM_DOT3(a, b, c)               \
    ({                                  \
        c = fma((a).s0, (b).s0, c);     \
        c = fma((a).s1, (b).s1, c);     \
        c = fma((a).s2, (b).s2, c);     \
    })
#define ARM_DOT4(a, b, c)               \
    ({                                  \
        c = fma((a).s0, (b).s0, c);     \
        c = fma((a).s1, (b).s1, c);     \
        c = fma((a).s2, (b).s2, c);     \
        c = fma((a).s3, (b).s3, c);     \
    })
#define ARM_DOT8(a, b, c)               \
    ({                                  \
        ARM_DOT4((a).lo, (b).lo, c);    \
        ARM_DOT4((a).hi, (b).hi, c);    \
    })
#define ARM_DOT16(a, b, c)              \
    ({                                  \
        ARM_DOT8((a).lo, (b).lo, c);    \
        ARM_DOT8((a).hi, (b).hi, c);    \
    })

/* ARM_DOT_K0XN0(k0, a, b, c): for every i in [0, N0), accumulates the k0-element dot product of a with
 * the variable b<i> into component i of c, using ARM_DOT<k0>.
 * k0 is forwarded through CONCAT (not pasted directly) so that a macro argument such as K0 is expanded
 * to its numeric value before the ARM_DOT<k0> name is formed.
 */
#if N0 == 2
#define ARM_DOT_K0XN0(k0, a, b, c)                      \
    ({                                                  \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0));       \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1));       \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(k0, a, b, c)                      \
    ({                                                  \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0));       \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1));       \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2));       \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(k0, a, b, c)                      \
    ({                                                  \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0));       \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1));       \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2));       \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3));       \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(k0, a, b, c)                      \
    ({                                                  \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0));       \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1));       \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2));       \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3));       \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4));       \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5));       \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6));       \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7));       \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(k0, a, b, c)                      \
    ({                                                  \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0));       \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1));       \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2));       \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3));       \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4));       \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5));       \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6));       \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7));       \
        CONCAT(ARM_DOT, k0)((a), (b##8), (c.s8));       \
        CONCAT(ARM_DOT, k0)((a), (b##9), (c.s9));       \
        CONCAT(ARM_DOT, k0)((a), (b##A), (c.sA));       \
        CONCAT(ARM_DOT, k0)((a), (b##B), (c.sB));       \
        CONCAT(ARM_DOT, k0)((a), (b##C), (c.sC));       \
        CONCAT(ARM_DOT, k0)((a), (b##D), (c.sD));       \
        CONCAT(ARM_DOT, k0)((a), (b##E), (c.sE));       \
        CONCAT(ARM_DOT, k0)((a), (b##F), (c.sF));       \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

1007

1008

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1009

* The LHS matrix is NOT reshaped

1010

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1011

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1012

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1013

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1014

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

1015

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1016

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1017

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1018

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

1019

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1020

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1021

* - N0 = 2, 3, 4, 8, 16

1022

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1023

* - H0 >= 1

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1024

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1025

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1026

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1027

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1028

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1029

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1030

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1031

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1032

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1033

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1034

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1035

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1036

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1037

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1038

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1039

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1040

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1041

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1042

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1043

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1044

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1045

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1046

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1047

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1048

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1049

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1050

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1051

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1052

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

1053

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1054

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1055

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1056

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1057

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1058

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1059

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1060

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1061

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1062

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1063

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1064

*/

1065

__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),

1066

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1067

#if defined(BETA)

1068

IMAGE_DECLARATION(bias),

1069

#endif // defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1070

IMAGE_DECLARATION(dst),

1071

uint lhs_stride_z,

1072

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1073

#if defined(BETA)

1074

uint bias_stride_z,

1075

#endif //defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1076

uint dst_stride_z

1077

#if defined(REINTERPRET_INPUT_AS_3D)

1078

,

1079

uint lhs_cross_plane_pad

1080

#endif // REINTERPRET_INPUT_AS_3D

1081

#if defined(REINTERPRET_OUTPUT_AS_3D)

1082

,

1083

uint dst_cross_plane_pad

1084

#endif // REINTERPRET_OUTPUT_AS_3D

1085

)

1086

{

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1087

// Block size

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1088

#define RHS_BLOCK_SIZE ((K0) * (N0))

1089

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1090

// RHS offset and step X

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1091

#if defined(RHS_INTERLEAVE)

1092

#define RHS_OFFSET_X (K0)

1093

#define RHS_STEP_X ((K0) * (H0))

1094

#define RHS_STEP_LOOP (1)

1095

#else // defined(RHS_INTERLEAVE)

1096

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1097

#define RHS_STEP_X (K0)

1098

#define RHS_STEP_LOOP (H0)

1099

#endif // defined(RHS_INTERLEAVE)

1100

1101

uint x = get_global_id(0);

1102

uint y = get_global_id(1);

1103

uint z = get_global_id(2);

1104

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1105

#if defined(DUMMY_WORK_ITEMS)

1106

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1111

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1112

// Compute LHS matrix address

1113

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1114

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1115

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1116

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1117

1118

#if defined(MATRIX_B_DEPTH)

1119

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1120

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1121

#else // defined(MATRIX_B_DEPTH)

1122

rhs_offset += z * rhs_stride_z;

1123

#endif // defined(MATRIX_B_DEPTH)

1124

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1125

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1126

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1127

1128

#if defined(REINTERPRET_INPUT_AS_3D)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1129

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1130

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1131

1132

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1133

// multiply lhs_stride_z by DEPTH_GEMM3D

1134

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1135

1136

#else // defined(REINTERPRET_INPUT_AS_3D)

1137

1138

// Add offset for batched GEMM

1139

lhs_offset += z * lhs_stride_z;

1140

1141

#endif // defined(REINTERPRET_INPUT_AS_3D)

1142

1143

// Initialize the accumulators

1144

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1145

1146

int i = 0;

1147

for(; i <= (K - K0); i += K0)

1148

{

1149

// Supported cases (M0, K0):

1150

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1151

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1152

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1153

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1154

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1155

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1156

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1157

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1158

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1159

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1160

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1161

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1162

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1163

1164

// Accumulate

1165

ARM_DOT_K0XN0(K0, a0, b, c0);

1166

#if M0 > 1

1167

ARM_DOT_K0XN0(K0, a1, b, c1);

1168

#endif // M0 > 1

1169

#if M0 > 2

1170

ARM_DOT_K0XN0(K0, a2, b, c2);

1171

#endif // M0 > 2

1172

#if M0 > 3

1173

ARM_DOT_K0XN0(K0, a3, b, c3);

1174

#endif // M0 > 3

1175

#if M0 > 4

1176

ARM_DOT_K0XN0(K0, a4, b, c4);

1177

#endif // M0 > 4

1178

#if M0 > 5

1179

ARM_DOT_K0XN0(K0, a5, b, c5);

1180

#endif // M0 > 5

1181

#if M0 > 6

1182

ARM_DOT_K0XN0(K0, a6, b, c6);

1183

#endif // M0 > 6

1184

#if M0 > 7

1185

ARM_DOT_K0XN0(K0, a7, b, c7);

1186

#endif // M0 > 7

1187

1188

lhs_offset += K0 * sizeof(DATA_TYPE);

1189

rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

1190

}

1191

1192

// Left-over accumulations

1193

for(; i < K; ++i)

1194

{

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1195

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1196

LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1197

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1198

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1199

LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1200

1201

// Accumulate

1202

ARM_DOT_K0XN0(1, a0, b, c0);

1203

#if M0 > 1

1204

ARM_DOT_K0XN0(1, a1, b, c1);

1205

#endif // M0 > 1

1206

#if M0 > 2

1207

ARM_DOT_K0XN0(1, a2, b, c2);

1208

#endif // M0 > 2

1209

#if M0 > 3

1210

ARM_DOT_K0XN0(1, a3, b, c3);

1211

#endif // M0 > 3

1212

#if M0 > 4

1213

ARM_DOT_K0XN0(1, a4, b, c4);

1214

#endif // M0 > 4

1215

#if M0 > 5

1216

ARM_DOT_K0XN0(1, a5, b, c5);

1217

#endif // M0 > 5

1218

#if M0 > 6

1219

ARM_DOT_K0XN0(1, a6, b, c6);

1220

#endif // M0 > 6

1221

#if M0 > 7

1222

ARM_DOT_K0XN0(1, a7, b, c7);

1223

#endif // M0 > 7

1224

1225

lhs_offset += sizeof(DATA_TYPE);

1226

rhs_offset += sizeof(DATA_TYPE);

1227

}

1228

1229

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

1230

1231

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1232

1233

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1234

1235

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1236

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1237

1238

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1239

// multiply dst_stride_z by DEPTH_GEMM3D

1240

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1241

1242

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1243

1244

// Add offset for batched GEMM

1245

dst_addr += z * dst_stride_z;

1246

1247

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1248

1249

// Multiply by the weight of matrix-matrix product and store the result

1250

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1251

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1252

#endif // defined(ALPHA)

1253

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1254

// Add beta*bias

1255

#if defined(BETA)

1256

#if defined(BROADCAST_BIAS)

1257

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1258

1259

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1260

1261

#ifndef UNIT_BETA

1262

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1263

#endif // UNIT_BIAS

1264

1265

// c = c + bias[broadcasted]

1266

ADD_BLOCK_BROADCAST(M0, c, bias0);

1267

1268

#else // defined(BROADCAST_BIAS)

1269

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

1270

2) * bias_stride_z;

1271

1272

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1273

1274

#ifndef UNIT_BETA

1275

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

1280

1281

#endif // defined(BROADCAST_BIAS)

1282

#endif // defined(BETA)

1283

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1284

#if defined(ACTIVATION_TYPE)

1285

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

1286

#endif // defined(ACTIVATION_TYPE)

1287

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1288

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1289

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1290

1291

#undef RHS_BLOCK_SIZE

1292

#undef RHS_OFFSET_X

1293

#undef RHS_STEP_X

1294

}

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1295

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1296

#if defined(OPENCL_IMAGE_SUPPORT)

1297

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object.

1298

* The LHS matrix is NOT reshaped

1299

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1300

*

1301

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

1302

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

1303

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1304

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

1305

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

1306

* could be different from the value returned by get_image_height(rhs_img).

1307

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1308

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1309

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

1310

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

1311

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1312

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

1318

* The activation function is performed after the bias addition

1319

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1320

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1321

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1322

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1323

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1324

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1325

*

1326

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32

1327

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

1328

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

1329

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

1330

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

1331

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

1332

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

1333

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1334

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1335

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1336

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1337

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1338

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1339

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1340

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1341

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1342

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1343

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1344

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1345

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

1346

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1347

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

1348

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1349

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1350

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

1351

*/

1352

__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),

1353

__read_only image2d_t rhs_img,

1354

#if defined(BETA)

1355

IMAGE_DECLARATION(bias),

1356

#endif // defined(BETA)

1357

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

1363

uint dst_stride_z

1364

#if defined(REINTERPRET_INPUT_AS_3D)

1365

,

1366

uint lhs_cross_plane_pad

1367

#endif // REINTERPRET_INPUT_AS_3D

1368

#if defined(REINTERPRET_OUTPUT_AS_3D)

1369

,

1370

uint dst_cross_plane_pad

1371

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

1376

1377

#define LEFTOVER_K (K % K0)

1378

1379

// Block size

1380

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

1381

1382

// RHS offset and step X

1383

#if defined(RHS_INTERLEAVE)

1384

#define RHS_OFFSET_X (PIXEL_UNIT)

1385

#define RHS_STEP_X (PIXEL_UNIT * (H0))

1386

#define RHS_STEP_LOOP (1)

1387

#else // defined(RHS_INTERLEAVE)

1388

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1389

#define RHS_STEP_X PIXEL_UNIT

1390

#define RHS_STEP_LOOP (H0)

1391

#endif // defined(RHS_INTERLEAVE)

1392

1393

uint x = get_global_id(0);

1394

uint y = get_global_id(1);

1395

uint z = get_global_id(2);

1396

1397

#if defined(DUMMY_WORK_ITEMS)

1398

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1403

1404

// Compute LHS matrix address

1405

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1406

1407

#if defined(MATRIX_B_DEPTH)

1408

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1409

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

1410

#else // defined(MATRIX_B_DEPTH)

1411

const uint z_rhs = get_global_id(2);

1412

#endif // defined(MATRIX_B_DEPTH)

1413

1414

// Compute RHS matrix coordinates

1415

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

1416

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

1417

1418

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

1419

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

1420

1421

#if defined(REINTERPRET_INPUT_AS_3D)

1422

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1423

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

1424

1425

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1426

// multiply lhs_stride_z by DEPTH_GEMM3D

1427

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1428

1429

#else // defined(REINTERPRET_INPUT_AS_3D)

1430

1431

// Add offset for batched GEMM

1432

lhs_offset += z * lhs_stride_z;

1433

1434

#endif // defined(REINTERPRET_INPUT_AS_3D)

1435

1436

// Initialize the accumulators

1437

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

1438

1439

int i = 0;

1440

for(; i <= (K - K0); i += K0)

1441

{

1442

// Load values from LHS matrix

1443

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

1444

1445

// Load values from RHS matrix stored in a cl_image

1446

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

1447

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

1448

1449

// Accumulate

1450

ARM_DOT_K0XN0(K0, a0, b, c0);

1451

#if M0 > 1

1452

ARM_DOT_K0XN0(K0, a1, b, c1);

1453

#endif // M0 > 1

1454

#if M0 > 2

1455

ARM_DOT_K0XN0(K0, a2, b, c2);

1456

#endif // M0 > 2

1457

#if M0 > 3

1458

ARM_DOT_K0XN0(K0, a3, b, c3);

1459

#endif // M0 > 3

1460

#if M0 > 4

1461

ARM_DOT_K0XN0(K0, a4, b, c4);

1462

#endif // M0 > 4

1463

#if M0 > 5

1464

ARM_DOT_K0XN0(K0, a5, b, c5);

1465

#endif // M0 > 5

1466

#if M0 > 6

1467

ARM_DOT_K0XN0(K0, a6, b, c6);

1468

#endif // M0 > 6

1469

#if M0 > 7

1470

ARM_DOT_K0XN0(K0, a7, b, c7);

1471

#endif // M0 > 7

1472

1473

lhs_offset += K0 * sizeof(DATA_TYPE);

1474

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

}

#if LEFTOVER_K != 0

// Note: We cannot read out-of-bound elements from the RHS matrix because

1479

// the RHS width is always multiple of K0. This is not be true for the LHS matrix

union UNION_VEC_TYPE

{

DATA_TYPE s[K0];

VEC_DATA_TYPE(DATA_TYPE, K0)

v;

};

union UNION_VEC_TYPE a0 = {.v = 0 };

1489

#if M0 > 1

1490

union UNION_VEC_TYPE a1 = {.v = 0 };

1491

#endif // M0 > 1

1492

#if M0 > 2

1493

union UNION_VEC_TYPE a2 = {.v = 0 };

1494

#endif // M0 > 2

1495

#if M0 > 3

1496

union UNION_VEC_TYPE a3 = {.v = 0 };

1497

#endif // M0 > 3

1498

#if M0 > 4

1499

union UNION_VEC_TYPE a4 = {.v = 0 };

1500

#endif // M0 > 4

1501

#if M0 > 5

1502

union UNION_VEC_TYPE a5 = {.v = 0 };

1503

#endif // M0 > 5

1504

#if M0 > 6

1505

union UNION_VEC_TYPE a6 = {.v = 0 };

1506

#endif // M0 > 6

1507

#if M0 > 7

1508

union UNION_VEC_TYPE a7 = {.v = 0 };

1509

#endif // M0 > 7

1510

1511

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

1512

1513

// Load from RHS matrix

1514

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

1515

1516

// Load from LHS matrix

1517

for(int k = 0; k < LEFTOVER_K; ++k)

1518

{

1519

a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);

1520

#if M0 > 1

1521

a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);

1522

#endif // M0 > 1

1523

#if M0 > 2

1524

a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);

1525

#endif // M0 > 2

1526

#if M0 > 3

1527

a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);

1528

#endif // M0 > 3

1529

#if M0 > 4

1530

a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);

1531

#endif // M0 > 4

1532

#if M0 > 5

1533

a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);

1534

#endif // M0 > 5

1535

#if M0 > 6

1536

a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);

1537

#endif // M0 > 6

1538

#if M0 > 7

1539

a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);

1540

#endif // M0 > 7

1541

1542

lhs_offset += sizeof(DATA_TYPE);

}

// Accumulate

ARM_DOT_K0XN0(K0, a0.v, b, c0);

1547

#if M0 > 1

1548

ARM_DOT_K0XN0(K0, a1.v, b, c1);

1549

#endif // M0 > 1

1550

#if M0 > 2

1551

ARM_DOT_K0XN0(K0, a2.v, b, c2);

1552

#endif // M0 > 2

1553

#if M0 > 3

1554

ARM_DOT_K0XN0(K0, a3.v, b, c3);

1555

#endif // M0 > 3

1556

#if M0 > 4

1557

ARM_DOT_K0XN0(K0, a4.v, b, c4);

1558

#endif // M0 > 4

1559

#if M0 > 5

1560

ARM_DOT_K0XN0(K0, a5.v, b, c5);

1561

#endif // M0 > 5

1562

#if M0 > 6

1563

ARM_DOT_K0XN0(K0, a6.v, b, c6);

1564

#endif // M0 > 6

1565

#if M0 > 7

1566

ARM_DOT_K0XN0(K0, a7.v, b, c7);

1567

#endif // M0 > 7

1568

1569

#endif // LEFTOVER_K != 0

1570

1571

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

1572

1573

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1574

1575

#if defined(REINTERPRET_OUTPUT_AS_3D)

1576

1577

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1578

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

1579

1580

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1581

// multiply dst_stride_z by DEPTH_GEMM3D

1582

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1583

1584

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1585

1586

// Add offset for batched GEMM

1587

dst_addr += z * dst_stride_z;

1588

1589

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1590

1591

// Multiply by the weight of matrix-matrix product and store the result

1592

#if defined(ALPHA)

1593

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

1594

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

1599

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1600

1601

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1602

1603

#ifndef UNIT_BETA

1604

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1605

#endif // UNIT_BIAS

1606

1607

// c = c + bias[broadcasted]

1608

ADD_BLOCK_BROADCAST(M0, c, bias0);

1609

1610

#else // defined(BROADCAST_BIAS)

1611

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

1612

2) * bias_stride_z;

1613

1614

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1615

1616

#ifndef UNIT_BETA

1617

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

1622

1623

#endif // defined(BROADCAST_BIAS)

1624

#endif // defined(BETA)

1625

1626

#if defined(ACTIVATION_TYPE)

1627

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

1628

#endif // defined(ACTIVATION_TYPE)

1629

1630

// Store output block

1631

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

1632

1633

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef LEFTOVER_K

#undef PIXEL_UNIT

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

1640

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1641

/** Fused multiply-add accumulation: assigns fma(a, b, c) back to c.
 *
 * The body is wrapped in a ({ ... }) statement expression.
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator. It appears both as the third fma() argument and as the assignment target,
 *                   so it must be a modifiable lvalue
 */
#define VFMA(a, b, c) \

({ \

c = fma(a, b, c); \

})

/** Row-wise vector multiply-accumulate over M0 rows, selected at compile time from the value of M0 (1 to 8).
 *
 * For every row r in [0, M0) the macro issues
 *   VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))(a<r>.s<i>), b, c<r>)
 * i.e. component s<i> of the variable a<r> is converted to an N0-wide vector, multiplied by b and accumulated into c<r>.
 *
 * @param[in]      i Component suffix pasted after ".s" (token pasting: the argument is not macro-expanded)
 * @param[in]      a Name prefix of the LHS row variables; the row index is pasted onto it (a0, a1, ...)
 * @param[in]      b RHS vector shared by every row
 * @param[in, out] c Name prefix of the accumulator variables; the row index is pasted onto it (c0, c1, ...)
 *
 * Any other value of M0 stops the build with an #error.
 */
#if M0 == 1
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // Unsupported M0
#error "M0 not supported"
#endif // M0 selection

1717

1718

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1719

* The LHS matrix is NOT reshaped

1720

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

1721

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1722

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1723

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

1724

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1725

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1726

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1727

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

1728

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1729

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1730

* - N0 = 2, 3, 4, 8, 16

1731

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1732

* - H0 >= 1

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1733

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1734

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1735

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1736

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1737

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1738

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1739

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1740

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1741

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1742

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1743

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1744

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1745

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1746

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1747

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1748

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1749

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1750

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1751

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem (in bytes)

1752

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1753

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem (in bytes)

1754

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1755

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1756

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1757

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1758

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1759

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1760

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1761

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1762

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1763

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1764

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1765

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1766

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1767

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1768

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1769

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1770

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1771

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1772

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1773

*/

1774

__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),

1775

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1776

#if defined(BETA)

1777

IMAGE_DECLARATION(bias),

1778

#endif // defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1779

IMAGE_DECLARATION(dst),

1780

uint lhs_stride_z,

1781

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1782

#if defined(BETA)

1783

uint bias_stride_z,

1784

#endif //defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1785

uint dst_stride_z

1786

#if defined(REINTERPRET_INPUT_AS_3D)

1787

,

1788

uint lhs_cross_plane_pad

1789

#endif // REINTERPRET_INPUT_AS_3D

1790

#if defined(REINTERPRET_OUTPUT_AS_3D)

1791

,

1792

uint dst_cross_plane_pad

1793

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

1798

1799

// RHS offset and step X

1800

#if defined(RHS_INTERLEAVE)

1801

#define RHS_OFFSET_X (N0)

1802

#define RHS_STEP_X ((N0) * (H0))

1803

#define RHS_STEP_LOOP (1)

1804

#else // defined(RHS_INTERLEAVE)

1805

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1806

#define RHS_STEP_X (N0)

1807

#define RHS_STEP_LOOP (H0)

1808

#endif // defined(RHS_INTERLEAVE)

1809

1810

uint x = get_global_id(0);

1811

uint y = get_global_id(1);

1812

uint z = get_global_id(2);

1813

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1814

#if defined(DUMMY_WORK_ITEMS)

1815

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1820

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1821

// Compute LHS matrix address

1822

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1823

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1824

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1825

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1826

1827

#if defined(MATRIX_B_DEPTH)

1828

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1829

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1830

#else // defined(MATRIX_B_DEPTH)

1831

rhs_offset += z * rhs_stride_z;

1832

#endif // defined(MATRIX_B_DEPTH)

1833

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1834

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;

1835

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1836

1837

#if defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1838

1839

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1840

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1841

1842

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1843

// multiply lhs_stride_z by DEPTH_GEMM3D

1844

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1845

1846

#else // defined(REINTERPRET_INPUT_AS_3D)

1847

1848

// Add offset for batched GEMM

1849

lhs_offset += z * lhs_stride_z;

1850

1851

#endif // defined(REINTERPRET_INPUT_AS_3D)

1852

1853

// Initialize the accumulators

1854

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;

1855

1856

int i = 0;

1857

for(; i <= (K - K0); i += K0)

1858

{

1859

// Supported cases (M0, K0):

1860

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1861

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1862

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1863

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1864

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1865

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1866

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1867

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1868

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1869

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1870

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1871

VEC_DATA_TYPE(DATA_TYPE, N0)

1872

b0;

1873

1874

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1875

VFMA_M0xN0(0, a, b0, c);

1876

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));

1877

VFMA_M0xN0(1, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1878

#if K0 > 2

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1879

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));

1880

VFMA_M0xN0(2, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1881

#endif // K0 > 2

1882

#if K0 > 3

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1883

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));

1884

VFMA_M0xN0(3, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1885

#endif // K0 > 3

1886

#if K0 > 4

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1887

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));

1888

VFMA_M0xN0(4, a, b0, c);

1889

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));

1890

VFMA_M0xN0(5, a, b0, c);

1891

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));

1892

VFMA_M0xN0(6, a, b0, c);

1893

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));

1894

VFMA_M0xN0(7, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1895

#endif // K0 > 4

1896

#if K0 > 8

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1897

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));

1898

VFMA_M0xN0(8, a, b0, c);

1899

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));

1900

VFMA_M0xN0(9, a, b0, c);

1901

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));

1902

VFMA_M0xN0(A, a, b0, c);

1903

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));

1904

VFMA_M0xN0(B, a, b0, c);

1905

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));

1906

VFMA_M0xN0(C, a, b0, c);

1907

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));

1908

VFMA_M0xN0(D, a, b0, c);

1909

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));

1910

VFMA_M0xN0(E, a, b0, c);

1911

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));

1912

VFMA_M0xN0(F, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1913

#endif // K0 > 8

1914

1915

lhs_offset += K0 * sizeof(DATA_TYPE);

1916

rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);

1917

}

1918

1919

// Left-over accumulations

1920

for(; i < K; ++i)

1921

{

1922

// Load values from LHS matrix

1923

VEC_DATA_TYPE(DATA_TYPE, 2)

1924

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

1925

#if M0 > 1

1926

VEC_DATA_TYPE(DATA_TYPE, 2)

1927

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

1928

#endif // M0 > 1

1929

#if M0 > 2

1930

VEC_DATA_TYPE(DATA_TYPE, 2)

1931

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

1932

#endif // M0 > 2

1933

#if M0 > 3

1934

VEC_DATA_TYPE(DATA_TYPE, 2)

1935

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

1936

#endif // M0 > 3

1937

#if M0 > 4

1938

VEC_DATA_TYPE(DATA_TYPE, 2)

1939

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

1940

#endif // M0 > 4

1941

#if M0 > 5

1942

VEC_DATA_TYPE(DATA_TYPE, 2)

1943

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

1944

#endif // M0 > 5

1945

#if M0 > 6

1946

VEC_DATA_TYPE(DATA_TYPE, 2)

1947

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

1948

#endif // M0 > 6

1949

#if M0 > 7

1950

VEC_DATA_TYPE(DATA_TYPE, 2)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

1951

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1952

#endif // M0 > 7

1953

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1954

VEC_DATA_TYPE(DATA_TYPE, N0)

1955

b0;

1956

1957

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1958

VFMA_M0xN0(0, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1959

1960

lhs_offset += sizeof(DATA_TYPE);

1961

rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);

1962

}

1963

1964

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

1965

1966

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1967

1968

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1969

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1970

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1971

1972

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1973

// multiply dst_stride_z by DEPTH_GEMM3D

1974

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1975

1976

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1977

1978

// Add offset for batched GEMM

1979

dst_addr += z * dst_stride_z;

1980

1981

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1982

1983

// Multiply by the weight of matrix-matrix product and store the result

1984

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1985

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1986

#endif // defined(ALPHA)

1987

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1988

// Add beta*bias

1989

#if defined(BETA)

1990

#if defined(BROADCAST_BIAS)

1991

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1992

1993

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1994

1995

#ifndef UNIT_BETA

1996

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1997

#endif // UNIT_BIAS

1998

1999

// c = c + bias[broadcasted]

2000

ADD_BLOCK_BROADCAST(M0, c, bias0);

2001

2002

#else // defined(BROADCAST_BIAS)

2003

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2004

2) * bias_stride_z;

2005

2006

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2007

2008

#ifndef UNIT_BETA

2009

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

2014

2015

#endif // defined(BROADCAST_BIAS)

2016

#endif // defined(BETA)

2017

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2018

#if defined(ACTIVATION_TYPE)

2019

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2020

#endif // defined(ACTIVATION_TYPE)

2021

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2022

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2023

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2024

2025

#undef RHS_BLOCK_SIZE

2026

#undef RHS_OFFSET_X

2027

#undef RHS_STEP_X

2028

}

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2029

2030

#if defined(OPENCL_IMAGE_SUPPORT)

2031

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2032

* The LHS matrix is NOT reshaped

2033

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

2034

*

2035

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2036

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2037

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

2038

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2039

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2040

* could be different from the value returned by get_image_height(rhs_img).

2041

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

2042

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

2043

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2044

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

2045

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2046

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

2052

* The activation function is performed after the bias addition

2053

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2054

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

2055

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2056

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2057

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2058

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

2059

*

2060

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32

2061

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

2062

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2063

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

2064

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2065

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

2066

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2067

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2068

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2069

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2070

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2071

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2072

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2073

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2074

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2075

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2076

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2077

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2078

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2079

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

2080

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2081

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2082

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2083

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

2084

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2085

*/

2086

__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),

2087

__read_only image2d_t rhs_img,

2088

#if defined(BETA)

2089

IMAGE_DECLARATION(bias),

2090

#endif // defined(BETA)

2091

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2097

uint dst_stride_z

2098

#if defined(REINTERPRET_INPUT_AS_3D)

2099

,

2100

uint lhs_cross_plane_pad

2101

#endif // REINTERPRET_INPUT_AS_3D

2102

#if defined(REINTERPRET_OUTPUT_AS_3D)

2103

,

2104

uint dst_cross_plane_pad

2105

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

2110

2111

// Block size

2112

#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

2113

2114

// RHS offset and step X

2115

#if defined(RHS_INTERLEAVE)

2116

#define RHS_OFFSET_X (PIXEL_UNIT)

2117

#define RHS_STEP_X ((PIXEL_UNIT) * (H0))

2118

#else // defined(RHS_INTERLEAVE)

2119

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2120

#define RHS_STEP_X (PIXEL_UNIT)

2121

#endif // defined(RHS_INTERLEAVE)

2122

2123

uint x = get_global_id(0);

2124

uint y = get_global_id(1);

2125

uint z = get_global_id(2);

2126

2127

#if defined(DUMMY_WORK_ITEMS)

2128

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2133

2134

// Compute LHS matrix address

2135

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

2136

2137

#if defined(MATRIX_B_DEPTH)

2138

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2139

const uint z_rhs = (z % MATRIX_B_DEPTH);

2140

#else // defined(MATRIX_B_DEPTH)

2141

const uint z_rhs = z;

2142

#endif // defined(MATRIX_B_DEPTH)

2143

2144

// Compute RHS matrix coordinates

2145

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

2146

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

2147

2148

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);

2149

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2150

2151

#if defined(REINTERPRET_INPUT_AS_3D)

2152

2153

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2154

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

2155

2156

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2157

// multiply lhs_stride_z by DEPTH_GEMM3D

2158

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

2159

2160

#else // defined(REINTERPRET_INPUT_AS_3D)

2161

2162

// Add offset for batched GEMM

2163

lhs_offset += z * lhs_stride_z;

2164

2165

#endif // defined(REINTERPRET_INPUT_AS_3D)

2166

2167

// Initialize the accumulators

2168

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

2169

2170

int i = 0;

2171

for(; i <= (K - K0); i += K0)

2172

{

2173

// Load values from LHS matrix

2174

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

2175

2176

VEC_DATA_TYPE(DATA_TYPE, N0)

2177

b0;

2178

2179

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2180

VFMA_M0xN0(0, a, b0, c);

2181

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

2182

VFMA_M0xN0(1, a, b0, c);

2183

#if K0 > 2

2184

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

2185

VFMA_M0xN0(2, a, b0, c);

2186

#endif // K0 > 2

2187

#if K0 > 3

2188

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

2189

VFMA_M0xN0(3, a, b0, c);

2190

#endif // K0 > 3

2191

#if K0 > 4

2192

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

2193

VFMA_M0xN0(4, a, b0, c);

2194

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

2195

VFMA_M0xN0(5, a, b0, c);

2196

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

2197

VFMA_M0xN0(6, a, b0, c);

2198

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

2199

VFMA_M0xN0(7, a, b0, c);

2200

#endif // K0 > 4

2201

#if K0 > 8

2202

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

2203

VFMA_M0xN0(8, a, b0, c);

2204

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

2205

VFMA_M0xN0(9, a, b0, c);

2206

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

2207

VFMA_M0xN0(A, a, b0, c);

2208

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

2209

VFMA_M0xN0(B, a, b0, c);

2210

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

2211

VFMA_M0xN0(C, a, b0, c);

2212

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

2213

VFMA_M0xN0(D, a, b0, c);

2214

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

2215

VFMA_M0xN0(E, a, b0, c);

2216

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

2217

VFMA_M0xN0(F, a, b0, c);

2218

#endif // K0 > 8

2219

2220

lhs_offset += K0 * sizeof(DATA_TYPE);

2221

x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;

2222

}

2223

2224

// Left-over accumulations

2225

for(; i < K; ++i)

2226

{

2227

// Load values from LHS matrix

2228

VEC_DATA_TYPE(DATA_TYPE, 2)

2229

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

2230

#if M0 > 1

2231

VEC_DATA_TYPE(DATA_TYPE, 2)

2232

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

2233

#endif // M0 > 1

2234

#if M0 > 2

2235

VEC_DATA_TYPE(DATA_TYPE, 2)

2236

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

2237

#endif // M0 > 2

2238

#if M0 > 3

2239

VEC_DATA_TYPE(DATA_TYPE, 2)

2240

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

2241

#endif // M0 > 3

2242

#if M0 > 4

2243

VEC_DATA_TYPE(DATA_TYPE, 2)

2244

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

2245

#endif // M0 > 4

2246

#if M0 > 5

2247

VEC_DATA_TYPE(DATA_TYPE, 2)

2248

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

2249

#endif // M0 > 5

2250

#if M0 > 6

2251

VEC_DATA_TYPE(DATA_TYPE, 2)

2252

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

2253

#endif // M0 > 6

2254

#if M0 > 7

2255

VEC_DATA_TYPE(DATA_TYPE, 2)

2256

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

2257

#endif // M0 > 7

2258

2259

VEC_DATA_TYPE(DATA_TYPE, N0)

2260

b0;

2261

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2262

2263

VFMA_M0xN0(0, a, b0, c);

2264

2265

lhs_offset += sizeof(DATA_TYPE);

x_rhs += RHS_STEP_X;

}

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

2270

2271

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

2272

2273

#if defined(REINTERPRET_OUTPUT_AS_3D)

2274

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2275

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2276

2277

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2278

// multiply dst_stride_z by DEPTH_GEMM3D

2279

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2280

2281

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2282

2283

// Add offset for batched GEMM

2284

dst_addr += z * dst_stride_z;

2285

2286

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2287

2288

// Multiply by the weight of matrix-matrix product and store the result

2289

#if defined(ALPHA)

2290

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2291

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2296

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2297

2298

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2299

2300

#ifndef UNIT_BETA

2301

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2302

#endif // UNIT_BIAS

2303

2304

// c = c + bias[broadcasted]

2305

ADD_BLOCK_BROADCAST(M0, c, bias0);

2306

2307

#else // defined(BROADCAST_BIAS)

2308

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2309

2) * bias_stride_z;

2310

2311

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2312

2313

#ifndef UNIT_BETA

2314

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

2319

2320

#endif // defined(BROADCAST_BIAS)

2321

#endif // defined(BETA)

2322

2323

#if defined(ACTIVATION_TYPE)

2324

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2325

#endif // defined(ACTIVATION_TYPE)

2326

2327

// Store output block

2328

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

2329

2330

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2335

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2336

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2337

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2338

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2339

// ARM_DOT_K0_STEP(a, b, c, i): one multiply-accumulate of lane i, c += a.s<i> * b.s<i>.
// With MIXED_PRECISION the step uses a plain multiply and add, otherwise the fma() built-in.
#if defined(MIXED_PRECISION)
#define ARM_DOT_K0_STEP(a, b, c, i) c += a.s##i * b.s##i
#else // defined(MIXED_PRECISION)
#define ARM_DOT_K0_STEP(a, b, c, i) c = fma(a.s##i, b.s##i, c)
#endif // defined(MIXED_PRECISION)

// ARM_DOT_K0(a, b, c): accumulate into c the dot product of the first K0 lanes of a and b.
// Lanes are accumulated in ascending order (s0, s1, ...).
#if K0 == 2
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_STEP(a, b, c, 0); \
        ARM_DOT_K0_STEP(a, b, c, 1); \
    })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_STEP(a, b, c, 0); \
        ARM_DOT_K0_STEP(a, b, c, 1); \
        ARM_DOT_K0_STEP(a, b, c, 2); \
    })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_STEP(a, b, c, 0); \
        ARM_DOT_K0_STEP(a, b, c, 1); \
        ARM_DOT_K0_STEP(a, b, c, 2); \
        ARM_DOT_K0_STEP(a, b, c, 3); \
    })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_STEP(a, b, c, 0); \
        ARM_DOT_K0_STEP(a, b, c, 1); \
        ARM_DOT_K0_STEP(a, b, c, 2); \
        ARM_DOT_K0_STEP(a, b, c, 3); \
        ARM_DOT_K0_STEP(a, b, c, 4); \
        ARM_DOT_K0_STEP(a, b, c, 5); \
        ARM_DOT_K0_STEP(a, b, c, 6); \
        ARM_DOT_K0_STEP(a, b, c, 7); \
    })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_STEP(a, b, c, 0); \
        ARM_DOT_K0_STEP(a, b, c, 1); \
        ARM_DOT_K0_STEP(a, b, c, 2); \
        ARM_DOT_K0_STEP(a, b, c, 3); \
        ARM_DOT_K0_STEP(a, b, c, 4); \
        ARM_DOT_K0_STEP(a, b, c, 5); \
        ARM_DOT_K0_STEP(a, b, c, 6); \
        ARM_DOT_K0_STEP(a, b, c, 7); \
        ARM_DOT_K0_STEP(a, b, c, 8); \
        ARM_DOT_K0_STEP(a, b, c, 9); \
        ARM_DOT_K0_STEP(a, b, c, A); \
        ARM_DOT_K0_STEP(a, b, c, B); \
        ARM_DOT_K0_STEP(a, b, c, C); \
        ARM_DOT_K0_STEP(a, b, c, D); \
        ARM_DOT_K0_STEP(a, b, c, E); \
        ARM_DOT_K0_STEP(a, b, c, F); \
    })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2454

2455

// ARM_DOT_K0XN0_COL(a, b, c, i): accumulate into lane i of c the K0-long dot product
// between a and the vector named b<i> (b is a name prefix, pasted with the lane index).
#define ARM_DOT_K0XN0_COL(a, b, c, i) ARM_DOT_K0((a), (b##i), (c.s##i))

// ARM_DOT_K0XN0(a, b, c): for every lane i in [0, N0), c.s<i> += dot(a, b<i>) over K0 elements.
#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
        ARM_DOT_K0XN0_COL(a, b, c, 3); \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
        ARM_DOT_K0XN0_COL(a, b, c, 3); \
        ARM_DOT_K0XN0_COL(a, b, c, 4); \
        ARM_DOT_K0XN0_COL(a, b, c, 5); \
        ARM_DOT_K0XN0_COL(a, b, c, 6); \
        ARM_DOT_K0XN0_COL(a, b, c, 7); \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
        ARM_DOT_K0XN0_COL(a, b, c, 3); \
        ARM_DOT_K0XN0_COL(a, b, c, 4); \
        ARM_DOT_K0XN0_COL(a, b, c, 5); \
        ARM_DOT_K0XN0_COL(a, b, c, 6); \
        ARM_DOT_K0XN0_COL(a, b, c, 7); \
        ARM_DOT_K0XN0_COL(a, b, c, 8); \
        ARM_DOT_K0XN0_COL(a, b, c, 9); \
        ARM_DOT_K0XN0_COL(a, b, c, A); \
        ARM_DOT_K0XN0_COL(a, b, c, B); \
        ARM_DOT_K0XN0_COL(a, b, c, C); \
        ARM_DOT_K0XN0_COL(a, b, c, D); \
        ARM_DOT_K0XN0_COL(a, b, c, E); \
        ARM_DOT_K0XN0_COL(a, b, c, F); \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

2511

2512

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2513

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2514

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2515

*

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2516

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2517

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2518

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2519

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2520

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2521

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2522

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2523

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2524

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2525

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

2526

* @note Only the following configurations of M0, N0 and K0 are currently supported:

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2527

* - M0 = 2, 3, 4, 5, 6, 7, 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2528

* - N0 = 2, 3, 4, 8, 16

2529

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

2530

* - V0 >= 1

2531

* - H0 >= 1

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2532

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2533

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2534

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2535

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2536

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2537

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2538

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2539

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2540

*

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2541

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

2542

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2543

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

2544

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2545

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

2546

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2547

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

2548

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

2549

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem (in bytes)

2550

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

2551

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem (in bytes)

2552

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

2553

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2554

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2555

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2556

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2557

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2558

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2559

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

2560

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2561

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2562

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2563

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2564

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2565

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2566

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2567

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2568

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2569

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2570

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2571

*/

2572

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

2573

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2574

#if defined(BETA)

2575

IMAGE_DECLARATION(bias),

2576

#endif // defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2577

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2578

uint k,

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2579

uint lhs_stride_z,

2580

uint rhs_stride_z,

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2581

#if defined(BETA)

2582

uint bias_stride_z,

2583

#endif //defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2584

uint dst_stride_z

2585

#if defined(REINTERPRET_OUTPUT_AS_3D)

2586

,

2587

uint dst_cross_plane_pad

2588

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

2593

2594

#if defined(LHS_INTERLEAVE)

2595

#define LHS_OFFSET_X (K0)

2596

#define LHS_STEP_X ((K0) * (V0))

2597

#define LHS_STEP_LOOP (1)

2598

#else // defined(INTERLEAVE)

2599

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2600

#define LHS_STEP_X (K0)

2601

#define LHS_STEP_LOOP (V0)

2602

#endif // defined(INTERLEAVE)

2603

2604

// Block size

2605

#define RHS_BLOCK_SIZE ((K0) * (N0))

2606

2607

// RHS offset and step X

2608

#if defined(RHS_INTERLEAVE)

2609

#define RHS_OFFSET_X (K0)

2610

#define RHS_STEP_X ((K0) * (H0))

2611

#define RHS_STEP_LOOP (1)

2612

#else // defined(RHS_INTERLEAVE)

2613

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2614

#define RHS_STEP_X (K0)

2615

#define RHS_STEP_LOOP (H0)

2616

#endif // defined(RHS_INTERLEAVE)

2617

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2618

#if defined(DUMMY_WORK_ITEMS)

2619

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2624

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2625

// Compute LHS matrix address

2626

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2627

(get_global_id(2) * lhs_stride_z);

2628

2629

// Compute RHS matrix address

2630

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

2631

2632

#if defined(MATRIX_B_DEPTH)

2633

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2634

rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;

2635

#else // defined(MATRIX_B_DEPTH)

2636

rhs_addr += get_global_id(2) * rhs_stride_z;

2637

#endif // defined(MATRIX_B_DEPTH)

2638

2639

// Initialize the accumulators

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2640

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2641

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2642

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2643

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2644

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2645

for(int i = 0; i < k; i += K0)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2646

{

2647

// Supported cases (M0, K0):

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2648

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

2649

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

2650

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

2651

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

2652

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

2653

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

2654

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

2655

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2656

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2657

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2658

2659

// Load values from RHS matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2660

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2661

2662

// Accumulate

2663

ARM_DOT_K0XN0(a0, b, c0);

2664

#if M0 > 1

2665

ARM_DOT_K0XN0(a1, b, c1);

2666

#endif // M0 > 1

2667

#if M0 > 2

2668

ARM_DOT_K0XN0(a2, b, c2);

2669

#endif // M0 > 2

2670

#if M0 > 3

2671

ARM_DOT_K0XN0(a3, b, c3);

2672

#endif // M0 > 3

2673

#if M0 > 4

2674

ARM_DOT_K0XN0(a4, b, c4);

2675

#endif // M0 > 4

2676

#if M0 > 5

2677

ARM_DOT_K0XN0(a5, b, c5);

2678

#endif // M0 > 5

2679

#if M0 > 6

2680

ARM_DOT_K0XN0(a6, b, c6);

2681

#endif // M0 > 6

2682

#if M0 > 7

2683

ARM_DOT_K0XN0(a7, b, c7);

2684

#endif // M0 > 7

2685

2686

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2687

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

2688

}

2689

2690

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2691

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2692

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2693

2694

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2695

2696

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2697

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2698

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2699

// multiply dst_stride_z by DEPTH_GEMM3D

2700

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2701

2702

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2703

2704

// Add offset for batched GEMM

2705

dst_addr += get_global_id(2) * dst_stride_z;

2706

2707

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2708

2709

// Multiply by the weight of matrix-matrix product and store the result

2710

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2711

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2712

#endif // defined(ALPHA)

2713

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2714

// Add beta*bias

2715

#if defined(BETA)

2716

#if defined(BROADCAST_BIAS)

2717

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2718

2719

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2720

2721

#ifndef UNIT_BETA

2722

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2723

#endif // UNIT_BIAS

2724

2725

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2726

#if defined(MIXED_PRECISION)

2727

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2728

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2729

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2730

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2731

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2732

2733

#else // defined(BROADCAST_BIAS)

2734

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2735

2) * bias_stride_z;

2736

2737

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2738

2739

#ifndef UNIT_BETA

2740

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

2741

#endif // UNIT_BIAS

2742

2743

// c = c + bias

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2744

#if defined(MIXED_PRECISION)

2745

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2746

ADD_BLOCK(M0, c, bias_hp);

2747

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2748

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2749

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2750

2751

#endif // defined(BROADCAST_BIAS)

2752

#endif // defined(BETA)

2753

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2754

#if defined(ACTIVATION_TYPE)

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2755

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2756

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2757

#else // defined(MIXED_PRECISION)

2758

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2759

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2760

#endif // defined(ACTIVATION_TYPE)

2761

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2762

// Store output block

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2763

#if defined(MIXED_PRECISION)

2764

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

2765

#else // defined(MIXED_PRECISION)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2766

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2767

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2768

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2769

#undef LHS_BLOCK_SIZE

2770

#undef LHS_OFFSET_X

2771

#undef LHS_STEP_X

2772

#undef RHS_BLOCK_SIZE

2773

#undef RHS_OFFSET_X

2774

#undef RHS_STEP_X

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2775

#undef LHS_STEP_LOOP

2776

#undef RHS_STEP_LOOP

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2777

}

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2778

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2779

#if defined(OPENCL_IMAGE_SUPPORT)

2780

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

2781

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2782

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2783

*

2784

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2785

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2786

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2787

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

2788

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2789

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2790

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2791

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2792

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2793

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2794

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2795

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2796

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2797

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

2798

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2799

* - M0 = 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

2806

* The activation function is performed after the bias addition

2807

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

2808

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2809

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2810

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2811

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2812

*

2813

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

2814

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2815

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

2816

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2817

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

2818

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2819

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2820

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2821

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2822

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2823

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2824

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2825

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2826

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

2827

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2828

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2829

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2830

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2831

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2832

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2833

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2834

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2835

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2836

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2837

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2838

*/

2839

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),

2840

__read_only image2d_t rhs_img,

2841

#if defined(BETA)

2842

IMAGE_DECLARATION(bias),

2843

#endif // defined(BETA)

2844

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2845

uint k,

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2851

uint dst_stride_z

2852

#if defined(REINTERPRET_OUTPUT_AS_3D)

2853

,

2854

uint dst_cross_plane_pad

2855

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

2860

2861

// Block size

2862

#define LHS_BLOCK_SIZE ((K0) * (M0))

2863

2864

#if defined(LHS_INTERLEAVE)

2865

#define LHS_OFFSET_X (K0)

2866

#define LHS_STEP_X ((K0) * (V0))

2867

#define LHS_STEP_LOOP (1)

2868

#else // defined(INTERLEAVE)

2869

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2870

#define LHS_STEP_X (K0)

2871

#define LHS_STEP_LOOP (V0)

2872

#endif // defined(INTERLEAVE)

2873

2874

// Block size

2875

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

2876

2877

// RHS offset and step X

2878

#if defined(RHS_INTERLEAVE)

2879

#define RHS_OFFSET_X (PIXEL_UNIT)

2880

#define RHS_STEP_X (PIXEL_UNIT * (H0))

2881

#define RHS_STEP_LOOP (1)

2882

#else // defined(RHS_INTERLEAVE)

2883

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2884

#define RHS_STEP_X PIXEL_UNIT

2885

#define RHS_STEP_LOOP (H0)

2886

#endif // defined(RHS_INTERLEAVE)

2887

2888

#if defined(DUMMY_WORK_ITEMS)

2889

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2894

2895

// Compute LHS matrix address

2896

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2897

(get_global_id(2) * lhs_stride_z);

2898

2899

#if defined(MATRIX_B_DEPTH)

2900

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2901

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

2902

#else // defined(MATRIX_B_DEPTH)

2903

const uint z_rhs = get_global_id(2);

2904

#endif // defined(MATRIX_B_DEPTH)

2905

2906

// Compute RHS matrix coordinates

2907

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

2908

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

2909

2910

// Initialize the accumulators

2911

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

2912

2913

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2914

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2915

2916

for(int i = 0; i < K; i += K0)

2917

{

2918

// Load values from LHS matrix

2919

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

2920

2921

// Load values from RHS matrix stored in a cl_image

2922

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

2923

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

2924

2925

// Accumulate

2926

ARM_DOT_K0XN0(a0, b, c0);

2927

#if M0 > 1

2928

ARM_DOT_K0XN0(a1, b, c1);

2929

#endif // M0 > 1

2930

#if M0 > 2

2931

ARM_DOT_K0XN0(a2, b, c2);

2932

#endif // M0 > 2

2933

#if M0 > 3

2934

ARM_DOT_K0XN0(a3, b, c3);

2935

#endif // M0 > 3

2936

#if M0 > 4

2937

ARM_DOT_K0XN0(a4, b, c4);

2938

#endif // M0 > 4

2939

#if M0 > 5

2940

ARM_DOT_K0XN0(a5, b, c5);

2941

#endif // M0 > 5

2942

#if M0 > 6

2943

ARM_DOT_K0XN0(a6, b, c6);

2944

#endif // M0 > 6

2945

#if M0 > 7

2946

ARM_DOT_K0XN0(a7, b, c7);

2947

#endif // M0 > 7

2948

2949

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2950

2951

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

2952

}

2953

2954

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2955

2956

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

2957

2958

#if defined(REINTERPRET_OUTPUT_AS_3D)

2959

2960

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2961

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2962

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2963

// multiply dst_stride_z by DEPTH_GEMM3D

2964

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2965

2966

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2967

2968

// Add offset for batched GEMM

2969

dst_addr += get_global_id(2) * dst_stride_z;

2970

2971

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2972

2973

// Multiply by the weight of matrix-matrix product and store the result

2974

#if defined(ALPHA)

2975

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2976

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2981

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2982

2983

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2984

2985

#ifndef UNIT_BETA

2986

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2987

#endif // UNIT_BIAS

2988

2989

// c = c + bias[broadcasted]

2990

#if defined(MIXED_PRECISION)

2991

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2992

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2993

#else // defined(MIXED_PRECISION)

2994

ADD_BLOCK_BROADCAST(M0, c, bias0);

2995

#endif // defined(MIXED_PRECISION)

2996

2997

#else // defined(BROADCAST_BIAS)

2998

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2999

2) * bias_stride_z;

3000

3001

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3002

3003

#ifndef UNIT_BETA

3004

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

#if defined(MIXED_PRECISION)

3009

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3010

ADD_BLOCK(M0, c, bias_hp);

3011

#else // defined(MIXED_PRECISION)

3012

ADD_BLOCK(M0, c, bias);

3013

#endif // defined(MIXED_PRECISION)

3014

3015

#endif // defined(BROADCAST_BIAS)

3016

#endif // defined(BETA)

3017

3018

#if defined(ACTIVATION_TYPE)

3019

#if defined(MIXED_PRECISION)

3020

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

3021

#else // defined(MIXED_PRECISION)

3022

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

3023

#endif // defined(MIXED_PRECISION)

3024

#endif // defined(ACTIVATION_TYPE)

3025

3026

// Store output block

3027

#if defined(MIXED_PRECISION)

3028

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3029

#else // defined(MIXED_PRECISION)

3030

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3031

#endif // defined(MIXED_PRECISION)

3032

3033

#undef LHS_BLOCK_SIZE

3034

#undef LHS_OFFSET_X

3035

#undef LHS_STEP_X

3036

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3044

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3045

#if defined(LHS_TRANSPOSE)

3046

3047

// Alias for VEC_DATA_TYPE(TYPE, SIZE); used by the ARM_VVM_T_NT_* macros to cast one LHS element before the multiply-accumulate
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

3048

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3049

// ARM_VFMA(N0, a, b, c): multiply-accumulate, c = a * b + c.
// - N0 is only used when MIXED_PRECISION is defined: a and b are then converted to
//   VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0) before the multiplication.
// - When GPU_ARCH == GPU_ARCH_MIDGARD a separate multiply and add is emitted, otherwise fma() is called.
// Note: every expansion already ends with a semicolon.
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#else // defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#endif // defined(MIXED_PRECISION)

3066

3067

// Column-vector (transposed) by row-vector (not transposed) multiply-accumulate for K0 = 1.
// One variant exists per supported M0 (1, 2, 3, 4, 8). For each row i in [0, M0) the i-th element of a
// (a itself when M0 = 1, otherwise a.s<i>) is cast to VTYPE(TYPE, N0) and accumulated with b into C##i
// through ARM_VFMA. Larger variants reuse the next smaller one and append the remaining rows.
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
    ({                                                 \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
    })
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })

// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
// a is the column-vector (transposed)
// b is the row-vector (not transposed)
// C is the output matrix
// Lower case is a vector (a, b)
// Upper case is a matrix (C)
// M0 is pasted directly into the macro name, so it must already be a literal (1, 2, 3, 4 or 8) at this point.
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)

3102

3103

#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \

3104

({ \

3105

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \

3106

})

3107

#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \

3108

({ \

3109

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \

3110

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \

3111

})

3112

#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \

3113

({ \

3114

ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \

3115

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \

3116

})

3117

#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \

3118

({ \

3119

ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \

3120

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \

3121

})

3122

#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \

3123

({ \

3124

ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \

3125

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \

3126

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \

3127

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \

3128

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \

3129

})

3130

#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \

3131

({ \

3132

ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \

3133

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \

3134

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \

3135

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \

3136

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \

3137

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \

3138

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \

3139

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \

3140

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \

3141

})

3142

3143

// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.

3144

// The dimensions for this matrix multiplications are defined through M0, N0 and K0

3145

// The dimensions supported are:

3146

// M0: 1, 2, 3, 4, 8

3147

// N0: 1, 2, 3, 4, 8, 16

3148

// K0: 1, 2, 3, 4, 8, 16

3149

// This macro calls the vector-by-matrix macro K0 times

3150

// A, B and C are matrices

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3151

#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \

3152

CONCAT(ARM_MM_T_NT_M0xN0x, K0) \

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3153

(M0, N0, TYPE, A, B, C)

3154

3155

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

3156

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

3157

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

3158

*

3159

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

3160

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3161

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3162

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

3163

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

3164

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

3165

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

3166

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

3167

* @note Only the following configurations of M0, N0 and K0 are currently supported:

3168

* - M0 = 2, 3, 4, 8

3169

* - N0 = 2, 3, 4, 8, 16

3170

* - K0 = 2, 3, 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3175

* The activation function is performed after the bias addition

3176

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

3177

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3178

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3179

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3180

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

3181

*

3182

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

3183

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

3184

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

3185

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

3186

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

3187

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

3188

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

3189

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

3190

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem (in bytes)

3191

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

3192

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem (in bytes)

3193

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

3194

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

3195

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3196

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

3197

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3198

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

3199

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

3200

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

3201

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3202

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3203

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3204

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3205

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3206

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3207

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

3208

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

3209

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

3210

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3211

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3212

*/

3213

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
                                            IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                            IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                            IMAGE_DECLARATION(dst),
                                            uint k,
                                            uint lhs_stride_z,
                                            uint rhs_stride_z,
#if defined(BETA)
                                            uint bias_stride_z,
#endif // defined(BETA)
                                            uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                            ,
                                            uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                           )
{
    // LHS block size
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (M0)
#define LHS_STEP_X ((M0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (M0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // RHS block size
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (N0)
#define RHS_STEP_X ((N0) * (H0))
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (N0)
#endif // defined(RHS_INTERLEAVE)

    // x selects the N0-wide output column block, y the M0-high output row block, z the batch
    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    const uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the MxN output have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

    // Compute RHS matrix address
    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_addr += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);

    // Each iteration consumes K0 steps along k. Every step loads M0 LHS values and N0 RHS values,
    // accumulates them into c and advances both pointers. The index is unsigned to match the type of k.
    for(uint i = 0; i < k; i += K0)
    {
        VEC_DATA_TYPE(DATA_TYPE, M0)
        a0;
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

#if K0 > 1
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;
#endif // K0 > 1

#if K0 > 2
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;
#endif // K0 > 2

#if K0 > 3
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;
#endif // K0 > 3

#if K0 > 4
        // Steps 4..7
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;
#endif // K0 > 4

#if K0 > 8
        // Steps 8..15
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;
#endif // K0 > 8

#ifndef LHS_INTERLEAVE
        // Non-interleaved layout: skip the other (V0 - 1) LHS blocks stored on the same row
        lhs += (M0 * K0 * (V0 - 1));
#endif // LHS_INTERLEAVE

#ifndef RHS_INTERLEAVE
        // Non-interleaved layout: skip the other (H0 - 1) RHS blocks stored on the same row
        rhs += (N0 * K0 * (H0 - 1));
#endif // RHS_INTERLEAVE
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // The activation is applied after the bias addition
#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#endif // defined(MIXED_PRECISION)

    // Release every helper macro defined at the top of this kernel
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef LHS_STEP_LOOP
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3528

#if defined(OPENCL_IMAGE_SUPPORT)

3529

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

3530

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

3531

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

3532

*

3533

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

3534

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3535

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

3536

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

3537

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

3538

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

3539

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3540

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

3541

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

3542

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

3543

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

3544

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

3545

* @note Only the following configurations of M0, N0 and K0 are currently supported:

* - M0 = 2, 3, 4, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3553

* The activation function is performed after the bias addition

3554

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

3555

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3556

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3557

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3558

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

3559

*

3560

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

3561

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

3562

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

3563

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

3564

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

3565

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

3566

* @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr

3567

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

3568

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3569

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

3570

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3571

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

3572

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

3573

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

3574

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3575

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3576

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3577

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3578

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3579

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3580

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

3581

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

3582

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

3583

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3584

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3585

*/

3586

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
                                                    __read_only image2d_t rhs_img,
#if defined(BETA)
                                                    IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                    IMAGE_DECLARATION(dst),
                                                    uint k,
                                                    uint lhs_stride_z,
                                                    uint rhs_stride_z,
#if defined(BETA)
                                                    uint bias_stride_z,
#endif // defined(BETA)
                                                    uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                    ,
                                                    uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                                                   )
{
    // N0 converted to the pixel unit used to address the RHS image
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

    // Number of elements in one LHS block
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset / step / loop step, selected by the interleave option
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (M0)
#define LHS_STEP_X ((M0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (M0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // Size of one RHS block, in pixel units
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

    // RHS offset and step along X, selected by the interleave option
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#endif // defined(RHS_INTERLEAVE)

    // One accumulation step of the K0 block:
    //  - load M0 LHS values from lhs into a0
    //  - read one RHS vector from the image at column (rhs_x + k_idx * RHS_STEP_X), row rhs_y, into b0
    //  - accumulate into c through ARM_MM_T_NT
    //  - advance the LHS pointer by LHS_STEP_X elements
#define TEXTURE_T_NT_STEP(k_idx)                                                                   \
    a0 = VLOAD(M0)(0, lhs);                                                                        \
    b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (rhs_x + (k_idx) * RHS_STEP_X), (rhs_y));    \
    ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);                                                    \
    lhs += LHS_STEP_X

    // Work-item coordinates: output block column, output block row and batch index
    const uint blk_x = get_global_id(0);
    const uint blk_y = get_global_id(1);
    const uint batch = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items dispatched beyond the N x M output have nothing to compute
    if((blk_x * N0 >= N) || (blk_y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Address of the first LHS element read by this work-item
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (blk_y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (blk_y / V0) * (uint)lhs_stride_y + (batch * lhs_stride_z);

    // When MATRIX_B_DEPTH is defined the RHS plane index wraps around it,
    // so matrix B is not slid if it has 3 dimensions and matrix A more than 3
#if defined(MATRIX_B_DEPTH)
    const uint rhs_z = (batch % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint rhs_z = batch;
#endif // defined(MATRIX_B_DEPTH)

    // RHS image coordinates
    uint       rhs_x = (blk_x % H0) * (uint)RHS_OFFSET_X;
    const uint rhs_y = (blk_x / (uint)H0) + rhs_z * RHS_HEIGHT;

    // Accumulators c0..c(M0-1), cleared to zero
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    // zero0..zero(M0-1): null Z offsets handed to LOAD_BLOCK when reading the bias
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

    for(int i = 0; i < K; i += K0)
    {
        VEC_DATA_TYPE(DATA_TYPE, M0) a0;
        VEC_DATA_TYPE(DATA_TYPE, N0) b0;

        TEXTURE_T_NT_STEP(0);

#if K0 > 1
        TEXTURE_T_NT_STEP(1);
#endif // K0 > 1

#if K0 > 2
        TEXTURE_T_NT_STEP(2);
#endif // K0 > 2

#if K0 > 3
        TEXTURE_T_NT_STEP(3);
#endif // K0 > 3

#if K0 > 4
        TEXTURE_T_NT_STEP(4);
        TEXTURE_T_NT_STEP(5);
        TEXTURE_T_NT_STEP(6);
        TEXTURE_T_NT_STEP(7);
#endif // K0 > 4

#if K0 > 8
        TEXTURE_T_NT_STEP(8);
        TEXTURE_T_NT_STEP(9);
        TEXTURE_T_NT_STEP(10);
        TEXTURE_T_NT_STEP(11);
        TEXTURE_T_NT_STEP(12);
        TEXTURE_T_NT_STEP(13);
        TEXTURE_T_NT_STEP(14);
        TEXTURE_T_NT_STEP(15);
#endif // K0 > 8

        // Non-interleaved LHS: skip the remaining (V0 - 1) blocks stored on the same row
#if !defined(LHS_INTERLEAVE)
        lhs += (M0 * K0 * (V0 - 1));
#endif // !defined(LHS_INTERLEAVE)

        // Move the RHS column past the K0 steps just consumed
        rhs_x += K0 * RHS_STEP_X;

        // Non-interleaved RHS: skip the remaining (H0 - 1) blocks stored on the same row
#if !defined(RHS_INTERLEAVE)
        rhs_x += (PIXEL_UNIT * K0 * (H0 - 1));
#endif // !defined(RHS_INTERLEAVE)
    }

    // zout0..zout(M0-1): per-row Z offsets used when storing the output block
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

    // Address of the first output element written by this work-item
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (blk_x * (uint)N0 * sizeof(DATA_TYPE)) + (blk_y * (uint)M0 * dst_stride_y);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (blk_y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, blk_y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Batched GEMM: the batches live in the fourth dimension, hence
    // dst_stride_z is multiplied by DEPTH_GEMM3D
    dst_addr += batch * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Batched GEMM: one Z stride per batch
    dst_addr += batch * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Scale the matrix-matrix product by alpha
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta * bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every row of the block
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (blk_x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    // A full M0 x N0 bias block is loaded from the same position as the output block
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (blk_x * (uint)N0 * sizeof(DATA_TYPE)) + (blk_y * (uint)M0 * bias_stride_y) + batch * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // Activation runs after the bias addition
#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // Store the output block (converted to DATA_TYPE first in mixed precision)
#if defined(MIXED_PRECISION)
    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
#endif // defined(MIXED_PRECISION)

#undef TEXTURE_T_NT_STEP
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3892

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3893

#endif // defined(LHS_TRANSPOSE)

3894

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

3895

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)

3896

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3897

#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

3898

3899

/* Fused multiply-add with in-place accumulation: c = fma(a, b, c) */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/* Single output row: c<row> = fma(N0-wide broadcast of a<row>.s<i>, b, c<row>) */
#define RHS_VFMA_ROW(row, i, a, b, c) VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##row).s##i), b, (c##row))

/* RHS_VFMA_ROWS_<n>: the single-row update applied to rows 0..n-1, in ascending row order */
#define RHS_VFMA_ROWS_1(i, a, b, c) RHS_VFMA_ROW(0, i, a, b, c)
#define RHS_VFMA_ROWS_2(i, a, b, c) \
    RHS_VFMA_ROWS_1(i, a, b, c);    \
    RHS_VFMA_ROW(1, i, a, b, c)
#define RHS_VFMA_ROWS_3(i, a, b, c) \
    RHS_VFMA_ROWS_2(i, a, b, c);    \
    RHS_VFMA_ROW(2, i, a, b, c)
#define RHS_VFMA_ROWS_4(i, a, b, c) \
    RHS_VFMA_ROWS_3(i, a, b, c);    \
    RHS_VFMA_ROW(3, i, a, b, c)
#define RHS_VFMA_ROWS_5(i, a, b, c) \
    RHS_VFMA_ROWS_4(i, a, b, c);    \
    RHS_VFMA_ROW(4, i, a, b, c)
#define RHS_VFMA_ROWS_6(i, a, b, c) \
    RHS_VFMA_ROWS_5(i, a, b, c);    \
    RHS_VFMA_ROW(5, i, a, b, c)
#define RHS_VFMA_ROWS_7(i, a, b, c) \
    RHS_VFMA_ROWS_6(i, a, b, c);    \
    RHS_VFMA_ROW(6, i, a, b, c)
#define RHS_VFMA_ROWS_8(i, a, b, c) \
    RHS_VFMA_ROWS_7(i, a, b, c);    \
    RHS_VFMA_ROW(7, i, a, b, c)

/* RHS_VFMA_M0xN0(i, a, b, c): for every row r in [0, M0), c<r> = fma(broadcast(a<r>.s<i>), b, c<r>).
 * i is the vector component suffix (0-9, A-F); a and c are variable basenames; b is an N0-wide vector. */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)   \
    ({                               \
        RHS_VFMA_ROWS_1(i, a, b, c); \
    })
#elif M0 == 2 // M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)   \
    ({                               \
        RHS_VFMA_ROWS_2(i, a, b, c); \
    })
#elif M0 == 3 // M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)   \
    ({                               \
        RHS_VFMA_ROWS_3(i, a, b, c); \
    })
#elif M0 == 4 // M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)   \
    ({                               \
        RHS_VFMA_ROWS_4(i, a, b, c); \
    })
#elif M0 == 5 // M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)   \
    ({                               \
        RHS_VFMA_ROWS_5(i, a, b, c); \
    })
#elif M0 == 6 // M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)   \
    ({                               \
        RHS_VFMA_ROWS_6(i, a, b, c); \
    })
#elif M0 == 7 // M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)   \
    ({                               \
        RHS_VFMA_ROWS_7(i, a, b, c); \
    })
#elif M0 == 8 // M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)   \
    ({                               \
        RHS_VFMA_ROWS_8(i, a, b, c); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

3975

3976

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

3977

* The LHS matrix is NOT reshaped

3978

* The RHS matrix is NOT reshaped

3979

*

3980

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3981

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

3982

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

3983

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

3984

* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)

3985

* @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3986

* @note Only the following configurations of M0, N0 and K0 are currently supported:

3987

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

3988

* - N0 = 2, 3, 4, 8, 16

3989

* - K0 = 2, 3, 4, 8, 16

3990

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3991

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

3992

* The activation function is performed after the bias addition

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3993

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

3994

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

3995

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3996

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3997

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3998

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

3999

*

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4000

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

4001

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

4002

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

4003

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

4004

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

4005

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

4006

* @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr

4007

* @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)

4008

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

4009

* @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)

4010

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

4011

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4012

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

4013

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4014

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

4015

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4016

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

4017

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

4018

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

4019

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4020

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4021

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4022

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4023

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

4024

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

4025

* @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)

4026

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

4027

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

4028

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

4029

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4030

*/

4031

__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
                             IMAGE_DECLARATION(rhs),
#if defined(BETA)
                             IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                             IMAGE_DECLARATION(dst),
                             uint lhs_stride_z,
                             uint rhs_stride_z,
#if defined(BETA)
                             uint bias_stride_z,
#endif // defined(BETA)
                             uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                             ,
                             uint lhs_cross_plane_pad
#endif // defined(REINTERPRET_INPUT_AS_3D)
#if defined(REINTERPRET_OUTPUT_AS_3D)
                             ,
                             uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                            )
{
    // Number of elements in one RHS block
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset along X
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

    // Accumulation for component idx of the K0 block: pairs a<r>.s<idx> with RHS row b<idx>
#define NATIVE_VFMA_STEP(idx) RHS_VFMA_M0xN0(idx, a, b##idx, c)

    // Left-over path: loads one LHS scalar of the given row into a 2-wide vector a<row>
    // (only component 0 is read afterwards, by RHS_VFMA_M0xN0(0, ...))
#define NATIVE_LOAD_LHS_ELEM(row) \
    VEC_DATA_TYPE(DATA_TYPE, 2)   \
    a##row = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + row * lhs_stride_y + zlhs##row))

    // Work-item coordinates: output block column, output block row and batch index
    uint blk_x = get_global_id(0);
    uint blk_y = get_global_id(1);
    uint batch = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items dispatched beyond the N x M output have nothing to compute
    if((blk_x * N0 >= N) || (blk_y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Byte offset of the first LHS row handled by this work-item
    uint lhs_offset = lhs_offset_first_element_in_bytes + blk_y * M0 * (uint)lhs_stride_y;

    // Byte offset of the first RHS column handled by this work-item
    uint rhs_offset = rhs_offset_first_element_in_bytes + blk_x * N0 * sizeof(DATA_TYPE);

    // When MATRIX_B_DEPTH is defined the RHS plane index wraps around it,
    // so matrix B is not slid if it has 3 dimensions and matrix A more than 3
#if defined(MATRIX_B_DEPTH)
    rhs_offset += (batch % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += batch * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // zlhs0..zlhs(M0-1): per-row Z offsets for the LHS loads
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    // zero0..zero15: null Z offsets for the RHS and bias loads
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing M (blk_y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, blk_y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Batched GEMM: the batches live in the fourth dimension, hence
    // lhs_stride_z is multiplied by DEPTH_GEMM3D
    lhs_offset += batch * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Batched GEMM: one Z stride per batch
    lhs_offset += batch * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Accumulators c0..c(M0-1), each an N0-wide vector cleared to zero
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;

    // Main loop: consume K in chunks of K0.
    // Supported (M0, K0) pairs: M0 in 1..8 combined with K0 in {2, 3, 4, 8, 16}
    for(; i <= (K - K0); i += K0)
    {
        // M0 x K0 block of the LHS matrix -> a0..a(M0-1)
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // K0 x N0 block of the RHS matrix -> b0..b(K0-1)
        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

        NATIVE_VFMA_STEP(0);
        NATIVE_VFMA_STEP(1);
#if K0 > 2
        NATIVE_VFMA_STEP(2);
#endif // K0 > 2
#if K0 > 3
        NATIVE_VFMA_STEP(3);
#endif // K0 > 3
#if K0 > 4
        NATIVE_VFMA_STEP(4);
        NATIVE_VFMA_STEP(5);
        NATIVE_VFMA_STEP(6);
        NATIVE_VFMA_STEP(7);
#endif // K0 > 4
#if K0 > 8
        NATIVE_VFMA_STEP(8);
        NATIVE_VFMA_STEP(9);
        NATIVE_VFMA_STEP(A);
        NATIVE_VFMA_STEP(B);
        NATIVE_VFMA_STEP(C);
        NATIVE_VFMA_STEP(D);
        NATIVE_VFMA_STEP(E);
        NATIVE_VFMA_STEP(F);
#endif // K0 > 8

        // Advance K0 columns in the LHS and K0 rows in the RHS
        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * rhs_stride_y;
    }

    // Left-over accumulations: one K element per iteration
    for(; i < K; ++i)
    {
        // One LHS scalar per row
        NATIVE_LOAD_LHS_ELEM(0);
#if M0 > 1
        NATIVE_LOAD_LHS_ELEM(1);
#endif // M0 > 1
#if M0 > 2
        NATIVE_LOAD_LHS_ELEM(2);
#endif // M0 > 2
#if M0 > 3
        NATIVE_LOAD_LHS_ELEM(3);
#endif // M0 > 3
#if M0 > 4
        NATIVE_LOAD_LHS_ELEM(4);
#endif // M0 > 4
#if M0 > 5
        NATIVE_LOAD_LHS_ELEM(5);
#endif // M0 > 5
#if M0 > 6
        NATIVE_LOAD_LHS_ELEM(6);
#endif // M0 > 6
#if M0 > 7
        NATIVE_LOAD_LHS_ELEM(7);
#endif // M0 > 7

        // One N0-wide RHS row
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
        RHS_VFMA_M0xN0(0, a, b, c);

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += rhs_stride_y;
    }

    // Address of the first output element written by this work-item
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (blk_x * (uint)N0 * sizeof(DATA_TYPE)) + (blk_y * (uint)M0 * dst_stride_y);

    // zout0..zout(M0-1): per-row Z offsets used when storing the output block
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (blk_y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, blk_y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Batched GEMM: the batches live in the fourth dimension, hence
    // dst_stride_z is multiplied by DEPTH_GEMM3D
    dst_addr += batch * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Batched GEMM: one Z stride per batch
    dst_addr += batch * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Scale the matrix-matrix product by alpha
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta * bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every row of the block
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    // A full M0 x N0 bias block is loaded from the same position as the output block
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // Activation runs after the bias addition
#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store the M0 x N0 output block
    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

#undef NATIVE_LOAD_LHS_ELEM
#undef NATIVE_VFMA_STEP
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}

#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

4260

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4261

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4262

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4263

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4264

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4265

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4266

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4267

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4268

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4269

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4270

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

4271

* The activation function is performed after the bias addition

4272

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4273

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4274

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4275

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4276

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4277

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4278

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

4279

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4280

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4281

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4282

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4283

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4284

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4285

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4286

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4287

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4288

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4289

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4290

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4291

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4292

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4293

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4294

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4295

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4296

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4297

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4298

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4299

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4300

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4301

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4302

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4303

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4304

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4305

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4306

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4307

*/

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4308

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),

4309

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4310

#if defined(BETA)

4311

IMAGE_DECLARATION(src2),

4312

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4313

IMAGE_DECLARATION(dst),

4314

uint src0_stride_z,

4315

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4316

#if defined(BETA)

4317

uint src2_stride_z,

4318

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4319

uint dst_stride_z

4320

#if defined(REINTERPRET_OUTPUT_AS_3D)

4321

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4322

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4323

#endif // REINTERPRET_OUTPUT_AS_3D

4324

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4325

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4326

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4327

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4328

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4329

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4330

// Offset

4331

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4332

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4333

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4334

// src_addr_a = address of matrix A

4335

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4336

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4337

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4338

4339

#if defined(MATRIX_B_DEPTH)

4340

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4341

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4342

#else // defined(MATRIX_B_DEPTH)

4343

src1_addr_in_bytes += z * src1_stride_z;

4344

#endif // defined(MATRIX_B_DEPTH)

4345

4346

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

4347

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4348

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4349

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4350

__global float *src_end_addr_b = src_addr_b + COLS_B;

4351

4352

src_addr_a += offset_row_a;

4353

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4354

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4355

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4360

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4361

for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4362

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4363

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4364

float4 a0 = vload4(0, src_addr_a);

4365

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4366

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4367

c0 += (float4)a0.s0 * b0;

4368

c1 += (float4)a0.s1 * b0;

4369

c2 += (float4)a0.s2 * b0;

4370

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4371

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4372

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4373

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

4374

b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4375

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4376

c0 += (float4)a0.s0 * b0;

4377

c1 += (float4)a0.s1 * b0;

4378

c2 += (float4)a0.s2 * b0;

4379

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4380

}

4381

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4382

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4383

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4384

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4385

float4 a0 = vload4(0, src_addr_a);

4386

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4387

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4388

c0 += (float4)a0.s0 * b0;

4389

c1 += (float4)a0.s1 * b0;

4390

c2 += (float4)a0.s2 * b0;

4391

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4392

}

4393

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4394

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4395

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4396

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4397

// Compute dst address

4398

__global uchar *dst_addr = offset(&dst, 0, 0);

4399

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4400

uint4 zout = 0;

4401

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4402

#if defined(REINTERPRET_OUTPUT_AS_3D)

4403

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4404

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4405

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4410

// |******************|

4411

// | cross_plane_pad |

4412

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4417

4418

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4419

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4420

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4421

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4422

// Add offset due to the cross plane paddings

4423

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4424

4425

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4426

// multiply dst_stride_z by DEPTH_GEMM3D

4427

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4428

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4429

// Add offset for batched GEMM

4430

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4431

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4432

4433

// Multiply by the weight of matrix-matrix product and store the result

4434

#if defined(ALPHA)

4435

SCALE_BLOCK(4, float, c, ALPHA);

4436

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4441

4442

#if defined(BROADCAST_BIAS)

4443

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4444

4445

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4446

4447

#ifndef UNIT_BETA

4448

SCALE_BLOCK(1, float, bias, BETA);

4449

#endif // UNIT_BETA

4450

4451

// c = c + bias[broadcasted]

4452

ADD_BLOCK_BROADCAST(4, c, bias0);

4453

4454

#else // defined(BROADCAST_BIAS)

4455

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

4456

2) * src2_stride_z;

4457

4458

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4459

4460

#ifndef UNIT_BETA

4461

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

4466

4467

#endif // defined(BROADCAST_BIAS)

4468

#endif // defined(BETA)

4469

4470

#if defined(ACTIVATION_TYPE)

4471

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

4472

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4473

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4474

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4475

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

4476

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

4477

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

4478

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4479

}

4480

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4481

/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4482

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4483

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4484

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4485

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4486

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4487

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4488

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4489

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4490

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

4491

* The activation function is performed after the bias addition

4492

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4493

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4494

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4495

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4496

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4497

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4498

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

4499

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4500

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4501

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4502

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4503

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4504

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4505

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4506

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4507

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4508

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4509

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4510

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4511

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4512

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4513

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4514

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4515

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4516

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4517

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4518

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4519

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4520

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4521

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4522

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4523

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4524

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4525

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4526

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4527

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4528

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

4529

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4530

#if defined(BETA)

4531

IMAGE_DECLARATION(src2),

4532

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4533

IMAGE_DECLARATION(dst),

4534

uint src0_stride_z,

4535

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4536

#if defined(BETA)

4537

uint src2_stride_z,

4538

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4539

uint dst_stride_z

4540

#if defined(REINTERPRET_OUTPUT_AS_3D)

4541

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4542

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4543

#endif // REINTERPRET_OUTPUT_AS_3D

4544

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4545

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4546

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4547

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4548

int z = get_global_id(2);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4549

4550

// Offset

4551

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4552

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

4553

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4554

// src_addr_a = address of matrix A

4555

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4556

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4557

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4558

4559

#if defined(MATRIX_B_DEPTH)

4560

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4561

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4562

#else // defined(MATRIX_B_DEPTH)

4563

src1_addr_in_bytes += z * src1_stride_z;

4564

#endif // defined(MATRIX_B_DEPTH)

4565

4566

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

4567

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4568

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4569

src_addr_a += offset_row_a;

4570

src_addr_b += offset_row_b;

4571

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4572

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4577

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4578

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

4579

4580

int i = 0;

4581

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4582

{

4583

// Load values from matrix A (interleaved) and matrix B (transposed)

4584

float4 a0 = vload4(0, src_addr_a);

4585

float4 b0 = vload4(0, src_addr_b);

4586

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4587

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4588

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4589

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4590

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4591

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4592

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4593

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4594

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4595

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4596

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4597

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4598

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4599

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4600

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4601

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4602

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4603

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4604

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4605

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4606

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4607

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4608

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4609

4610

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4611

a0 = vload4(0, src_addr_a);

4612

b0 = vload4(0, src_addr_b);

4613

4614

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4615

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4616

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4617

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4618

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4619

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4620

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4621

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4622

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4623

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4624

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4625

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4626

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4627

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4628

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4629

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4630

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4631

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4632

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4633

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4634

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4635

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4636

4637

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4638

a0 = vload4(0, src_addr_a);

4639

b0 = vload4(0, src_addr_b);

4640

4641

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4642

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

4643

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4644

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4645

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4646

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4647

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4648

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4649

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4650

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4651

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4652

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4653

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4654

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4655

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4656

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4657

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4658

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4659

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4660

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4661

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4662

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4663

4664

// Load values from matrix A (interleaved) and matrix B (transposed)

4665

a0 = vload4(0, src_addr_a);

4666

b0 = vload4(0, src_addr_b);

4667

4668

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4669

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4670

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4671

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4672

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4673

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4674

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4675

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4676

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4677

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4678

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4679

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4680

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4681

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4682

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4683

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4684

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4685

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4686

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4687

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4688

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4689

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4690

}

4691

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4692

for(; i < (int)(COLS_MTX_B); ++i)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4693

{

4694

// Load values from matrix A (interleaved) and matrix B (transposed)

4695

float4 a0 = vload4(0, src_addr_a);

4696

float4 b0 = vload4(0, src_addr_b);

4697

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4698

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4699

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

4700

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4701

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4702

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4703

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4704

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4705

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4706

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4707

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4708

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4709

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4710

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4711

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4712

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4713

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4714

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4715

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4716

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4717

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4718

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4719

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4720

}

4721

4722

// Compute destination address

4723

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4724

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4725

// Compute dst address

4726

__global uchar *dst_addr = offset(&dst, 0, 0);

4727

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4728

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4729

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4730

#if defined(REINTERPRET_OUTPUT_AS_3D)

4731

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4732

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4733

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4738

// |******************|

4739

// | cross_plane_pad |

4740

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4745

4746

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4747

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4748

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4749

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4750

// Add offset due to the cross plane paddings

4751

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4752

4753

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4754

// multiply dst_stride_z by DEPTH_GEMM3D

4755

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4756

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4757

// Add offset for batched GEMM

4758

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4759

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4760

4761

// Multiply by the weight of matrix-matrix product and store the result

4762

#if defined(ALPHA)

4763

SCALE_BLOCK(4, float, c, ALPHA);

4764

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4769

4770

#if defined(BROADCAST_BIAS)

4771

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4772

4773

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4774

4775

#ifndef UNIT_BETA

4776

SCALE_BLOCK(1, float, bias, BETA);

4777

#endif // UNIT_BETA

4778

4779

// c = c + bias[broadcasted]

4780

ADD_BLOCK_BROADCAST(4, c, bias0);

4781

4782

#else // defined(BROADCAST_BIAS)

4783

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

4784

2) * src2_stride_z;

4785

4786

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4787

4788

#ifndef UNIT_BETA

4789

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

4794

4795

#endif // defined(BROADCAST_BIAS)

4796

#endif // defined(BETA)

4797

4798

#if defined(ACTIVATION_TYPE)

4799

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

4800

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4801

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4802

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4803

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

4804

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

4805

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

4806

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4807

}

4808

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

4809

// Undefine local defines

4810

#undef COLS_MTX_B

4811

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

4812

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4813

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4814

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4815

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4816

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4817

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4818

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4819

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4820

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4821

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

4822

* The activation function is performed after the bias addition

4823

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4824

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4825

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4826

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4827

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4828

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4829

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

4830

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4831

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

4832

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4833

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

4834

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4835

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4836

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4837

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

4838

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4839

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

4840

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4841

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4842

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4843

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4844

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4845

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4846

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4847

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4848

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4849

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4850

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4851

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4852

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4853

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4854

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4855

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4856

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4857

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4858

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4859

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

4860

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4861

#if defined(BETA)

4862

IMAGE_DECLARATION(src2),

4863

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4864

IMAGE_DECLARATION(dst),

4865

uint src0_stride_z,

4866

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4867

#if defined(BETA)

4868

uint src2_stride_z,

4869

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4870

uint dst_stride_z

4871

#if defined(REINTERPRET_OUTPUT_AS_3D)

4872

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4873

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4874

#endif // REINTERPRET_OUTPUT_AS_3D

4875

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4876

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4877

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4878

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4879

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4880

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4881

// Offset

4882

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4883

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4884

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4885

// src_addr_a = address of matrix A

4886

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4887

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4888

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4889

4890

#if defined(MATRIX_B_DEPTH)

4891

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4892

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4893

#else // defined(MATRIX_B_DEPTH)

4894

src1_addr_in_bytes += z * src1_stride_z;

4895

#endif // defined(MATRIX_B_DEPTH)

4896

4897

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

4898

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4899

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4900

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4901

__global half *src_end_addr_b = src_addr_b + COLS_B;

4902

4903

src_addr_a += offset_row_a;

4904

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4905

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4906

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4911

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4912

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4913

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4914

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4915

half4 a0 = vload4(0, src_addr_a);

4916

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4917

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4918

c0 += (half8)a0.s0 * b0;

4919

c1 += (half8)a0.s1 * b0;

4920

c2 += (half8)a0.s2 * b0;

4921

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4922

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4923

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4924

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

4925

b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4926

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4927

c0 += (half8)a0.s0 * b0;

4928

c1 += (half8)a0.s1 * b0;

4929

c2 += (half8)a0.s2 * b0;

4930

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4931

}

4932

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4933

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4934

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4935

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4936

half4 a0 = vload4(0, src_addr_a);

4937

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4938

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4939

c0 += (half8)a0.s0 * b0;

4940

c1 += (half8)a0.s1 * b0;

4941

c2 += (half8)a0.s2 * b0;

4942

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4943

}

4944

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4945

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4946

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4947

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4948

// Compute dst address

4949

__global uchar *dst_addr = offset(&dst, 0, 0);

4950

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4951

uint4 zout = 0;

4952

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4953

#if defined(REINTERPRET_OUTPUT_AS_3D)

4954

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4955

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4956

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4961

// |******************|

4962

// | cross_plane_pad |

4963

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4968

4969

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4970

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4971

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4972

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4973

// Add offset due to the cross plane paddings

4974

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4975

4976

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4977

// multiply dst_stride_z by DEPTH_GEMM3D

4978

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4979

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4980

// Add offset for batched GEMM

4981

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4982

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4983

4984

// Multiply by the weight of matrix-matrix product and store the result

4985

#if defined(ALPHA)

4986

SCALE_BLOCK(4, half, c, ALPHA);

4987

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4992

4993

#if defined(BROADCAST_BIAS)

4994

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

4995

4996

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

4997

4998

#ifndef UNIT_BETA

4999

SCALE_BLOCK(1, half, bias, BETA);

5000

#endif // UNIT_BETA

5001

5002

// c = c + bias[broadcasted]

5003

ADD_BLOCK_BROADCAST(4, c, bias0);

5004

5005

#else // defined(BROADCAST_BIAS)

5006

5007

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5008

2) * src2_stride_z;

5009

5010

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5011

5012

#ifndef UNIT_BETA

5013

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

5018

5019

#endif // defined(BROADCAST_BIAS)

5020

#endif // defined(BETA)

5021

5022

#if defined(ACTIVATION_TYPE)

5023

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

5024

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5025

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5026

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5027

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5028

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5029

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5030

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5031

}

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5032

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5033

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable.

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5034

*

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5035

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5036

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

5037

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

5038

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5039

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5040

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5041

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

5042

* The activation function is performed after the bias addition

5043

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5044

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5045

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5046

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5047

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5048

*

5049

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5050

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5051

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5052

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5053

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5054

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5055

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5056

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5057

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5058

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5059

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5060

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5061

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5062

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5063

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5064

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5065

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5066

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5067

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

5068

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5069

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5070

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5071

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5072

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

5073

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5074

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5075

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5076

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

5077

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

5078

*/

5079

__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),

5080

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5081

#if defined(BETA)

5082

IMAGE_DECLARATION(src2),

5083

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5084

IMAGE_DECLARATION(dst),

5085

uint src0_stride_z,

5086

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5087

#if defined(BETA)

5088

uint src2_stride_z,

5089

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5090

uint dst_stride_z

5091

#if defined(REINTERPRET_OUTPUT_AS_3D)

5092

,

5093

uint cross_plane_pad

5094

#endif // REINTERPRET_OUTPUT_AS_3D

5095

)

5096

{

5097

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

5098

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

5099

int z = get_global_id(2);

5100

5101

// Offset

5102

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

5103

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

5104

5105

// src_addr_a = address of matrix A

5106

// src_addr_b = address of matrix B

5107

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

5108

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

5109

5110

#if defined(MATRIX_B_DEPTH)

5111

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5112

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

5113

#else // defined(MATRIX_B_DEPTH)

5114

src1_addr_in_bytes += z * src1_stride_z;

5115

#endif // defined(MATRIX_B_DEPTH)

5116

5117

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

5118

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

5119

5120

// Compute end row address for matrix B

5121

__global half *src_end_addr_b = src_addr_b + COLS_B;

5122

5123

src_addr_a += offset_row_a;

5124

src_addr_b += offset_row_b;

5125

5126

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float8 c0 = 0.0f;

float8 c1 = 0.0f;

float8 c2 = 0.0f;

float8 c3 = 0.0f;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5131

5132

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

5133

{

5134

// Load values from matrix A (interleaved) and matrix B (transposed)

5135

float4 a0 = convert_float4(vload4(0, src_addr_a));

5136

float8 b0 = convert_float8(vload8(0, src_addr_b));

5137

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5138

c0 += (float8)a0.s0 * b0;

5139

c1 += (float8)a0.s1 * b0;

5140

c2 += (float8)a0.s2 * b0;

5141

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5142

5143

// Load values from matrix A (interleaved) and matrix B (transposed)

5144

a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));

5145

b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));

5146

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5147

c0 += (float8)a0.s0 * b0;

5148

c1 += (float8)a0.s1 * b0;

5149

c2 += (float8)a0.s2 * b0;

5150

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5151

}

5152

5153

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

5154

{

5155

// Load values from matrix A (interleaved) and matrix B (transposed)

5156

float4 a0 = convert_float4(vload4(0, src_addr_a));

5157

float8 b0 = convert_float8(vload8(0, src_addr_b));

5158

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5159

c0 += (float8)a0.s0 * b0;

5160

c1 += (float8)a0.s1 * b0;

5161

c2 += (float8)a0.s2 * b0;

5162

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5163

}

5164

5165

// Compute destination address

5166

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5167

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5168

// Compute dst address

5169

__global uchar *dst_addr = offset(&dst, 0, 0);

5170

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5171

uint4 zout = 0;

5172

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5173

#if defined(REINTERPRET_OUTPUT_AS_3D)

5174

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

5175

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5181

// |******************|

5182

// | cross_plane_pad |

5183

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5188

5189

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5190

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

5191

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5192

5193

// Add offset due to the cross plane paddings

5194

zout *= (cross_plane_pad * dst_stride_y);

5195

5196

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5197

// multiply dst_stride_z by DEPTH_GEMM3D

5198

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5199

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5200

// Add offset for batched GEMM

5201

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5202

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5203

5204

// Multiply by the weight of matrix-matrix product and store the result

5205

#if defined(ALPHA)

5206

SCALE_BLOCK(4, float, c, ALPHA);

5207

#endif // defined(ALPHA)

5208

5209

#if defined(BETA)

5210

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5211

5212

#if defined(BROADCAST_BIAS)

5213

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5214

5215

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5216

5217

float8 bias_f0 = convert_float8(bias0);

5218

5219

#ifndef UNIT_BETA

5220

SCALE_BLOCK(1, float, bias_f, BETA);

5221

#endif // UNIT_BETA

5222

5223

// c = c + bias[broadcasted]

5224

ADD_BLOCK_BROADCAST(4, c, bias_f0);

5225

5226

#else // defined(BROADCAST_BIAS)

5227

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5228

2) * src2_stride_z;

5229

5230

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5231

5232

float8 bias_f0 = convert_float8(bias0);

5233

float8 bias_f1 = convert_float8(bias1);

5234

float8 bias_f2 = convert_float8(bias2);

5235

float8 bias_f3 = convert_float8(bias3);

5236

5237

#ifndef UNIT_BETA

5238

SCALE_BLOCK(4, float, bias_f, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias_f);

5243

5244

#endif // defined(BROADCAST_BIAS)

5245

#endif // defined(BETA)

5246

5247

half8 c_h0 = convert_half8(c0);

5248

half8 c_h1 = convert_half8(c1);

5249

half8 c_h2 = convert_half8(c2);

5250

half8 c_h3 = convert_half8(c3);

5251

5252

#if defined(ACTIVATION_TYPE)

5253

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);

5254

#endif // defined(ACTIVATION_TYPE)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5255

5256

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5257

vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5258

vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5259

vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5260

vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5261

}

5262

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5263

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5264

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5265

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5266

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

5267

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

5268

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5269

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5270

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5271

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

5272

* The activation function is performed after the bias addition

5273

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5274

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5275

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5276

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5277

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5278

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5279

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5280

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5281

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5282

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5283

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5284

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5285

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5286

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5287

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5288

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5289

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5290

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5291

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

5292

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5293

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5294

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5295

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5296

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5297

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

5298

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5299

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5300

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5301

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5302

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5303

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5304

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

5305

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5306

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5307

*/

5308

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),

5309

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5310

#if defined(BETA)

5311

IMAGE_DECLARATION(src2),

5312

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5313

IMAGE_DECLARATION(dst),

5314

uint src0_stride_z,

5315

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5316

#if defined(BETA)

5317

uint src2_stride_z,

5318

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5319

uint dst_stride_z

5320

#if defined(REINTERPRET_OUTPUT_AS_3D)

5321

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5322

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5323

#endif // REINTERPRET_OUTPUT_AS_3D

5324

)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5325

{

5326

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

5327

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

5328

int z = get_global_id(2);

5329

5330

// Offset

5331

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

5332

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

5333

5334

// src_addr_a = address of matrix A

5335

// src_addr_b = address of matrix B

5336

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

5337

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

5338

5339

#if defined(MATRIX_B_DEPTH)

5340

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5341

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

5342

#else // defined(MATRIX_B_DEPTH)

5343

src1_addr_in_bytes += z * src1_stride_z;

5344

#endif // defined(MATRIX_B_DEPTH)

5345

5346

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

5347

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

5348

5349

// Compute end row address for matrix B

5350

__global half *src_end_addr_b = src_addr_b + COLS_B;

5351

5352

src_addr_a += offset_row_a;

5353

src_addr_b += offset_row_b;

5354

5355

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5360

5361

#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

5362

5363

int i = 0;

5364

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

5365

{

5366

#if MULT_INTERLEAVE4X4_HEIGHT == 1

5367

// Load values from matrix A (interleaved) and matrix B (transposed)

5368

half8 a0 = vload8(0, src_addr_a);

5369

half8 b0 = vload8(0, src_addr_b);

5370

5371

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

5372

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5373

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5374

c0 = fma((half8)a0.s0, b0, c0);

5375

c1 = fma((half8)a0.s1, b0, c1);

5376

c2 = fma((half8)a0.s2, b0, c2);

5377

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5378

5379

// Load values from matrix B (transposed)

5380

b0 = vload8(0, src_addr_b);

5381

5382

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5383

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5384

c0 = fma((half8)a0.s4, b0, c0);

5385

c1 = fma((half8)a0.s5, b0, c1);

5386

c2 = fma((half8)a0.s6, b0, c2);

5387

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5388

5389

// Load values from matrix A (interleaved) and matrix B (transposed)

5390

a0 = vload8(0, src_addr_a);

5391

b0 = vload8(0, src_addr_b);

5392

5393

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

5394

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5395

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5396

c0 = fma((half8)a0.s0, b0, c0);

5397

c1 = fma((half8)a0.s1, b0, c1);

5398

c2 = fma((half8)a0.s2, b0, c2);

5399

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5400

5401

// Load values from matrix B (transposed)

5402

b0 = vload8(0, src_addr_b);

5403

5404

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5405

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5406

c0 = fma((half8)a0.s4, b0, c0);

5407

c1 = fma((half8)a0.s5, b0, c1);

5408

c2 = fma((half8)a0.s6, b0, c2);

5409

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5410

#else // MULT_INTERLEAVE4X4_HEIGHT == 1

5411

// Load values from matrix A (interleaved) and matrix B (transposed)

5412

half4 a0 = vload4(0, src_addr_a);

5413

half8 b0 = vload8(0, src_addr_b);

5414

5415

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5416

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5417

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5418

c0 = fma((half8)a0.s0, b0, c0);

5419

c1 = fma((half8)a0.s1, b0, c1);

5420

c2 = fma((half8)a0.s2, b0, c2);

5421

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5422

5423

// Load values from matrix A (interleaved) and matrix B (transposed)

5424

a0 = vload4(0, src_addr_a);

5425

b0 = vload8(0, src_addr_b);

5426

5427

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5428

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5429

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5430

c0 = fma((half8)a0.s0, b0, c0);

5431

c1 = fma((half8)a0.s1, b0, c1);

5432

c2 = fma((half8)a0.s2, b0, c2);

5433

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5434

5435

// Load values from matrix A (interleaved) and matrix B (transposed)

5436

a0 = vload4(0, src_addr_a);

5437

b0 = vload8(0, src_addr_b);

5438

5439

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5440

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5441

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5442

c0 = fma((half8)a0.s0, b0, c0);

5443

c1 = fma((half8)a0.s1, b0, c1);

5444

c2 = fma((half8)a0.s2, b0, c2);

5445

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5446

5447

// Load values from matrix A (interleaved) and matrix B (transposed)

5448

a0 = vload4(0, src_addr_a);

5449

b0 = vload8(0, src_addr_b);

5450

5451

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5452

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5453

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5454

c0 = fma((half8)a0.s0, b0, c0);

5455

c1 = fma((half8)a0.s1, b0, c1);

5456

c2 = fma((half8)a0.s2, b0, c2);

5457

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5458

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

5459

}

5460

5461

for(; i < (int)(COLS_MTX_B); ++i)

5462

{

5463

// Load values from matrix A (interleaved) and matrix B (transposed)

5464

half4 a0 = vload4(0, src_addr_a);

5465

half8 b0 = vload8(0, src_addr_b);

5466

5467

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5468

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5469

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5470

c0 = fma((half8)a0.s0, b0, c0);

5471

c1 = fma((half8)a0.s1, b0, c1);

5472

c2 = fma((half8)a0.s2, b0, c2);

5473

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5474

}

5475

5476

// Compute destination address

5477

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5478

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5479

// Compute dst address

5480

__global uchar *dst_addr = offset(&dst, 0, 0);

5481

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5482

uint4 zout = 0;

5483

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5484

#if defined(REINTERPRET_OUTPUT_AS_3D)

5485

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5486

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5487

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5492

// |******************|

5493

// | cross_plane_pad |

5494

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5499

5500

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5501

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

5502

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5503

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5504

// Add offset due to the cross plane paddings

5505

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5506

5507

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5508

// multiply dst_stride_z by DEPTH_GEMM3D

5509

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5510

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5511

// Add offset for batched GEMM

5512

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5513

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5514

5515

// Multiply by the weight of matrix-matrix product and store the result

5516

#if defined(ALPHA)

5517

SCALE_BLOCK(4, half, c, ALPHA);

5518

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5523

5524

#if defined(BROADCAST_BIAS)

5525

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5526

5527

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5528

5529

#ifndef UNIT_BETA

5530

SCALE_BLOCK(1, half, bias, BETA);

5531

#endif // UNIT_BIAS

5532

5533

// c = c + bias[broadcasted]

5534

ADD_BLOCK_BROADCAST(4, c, bias0);

5535

5536

#else // defined(BROADCAST_BIAS)

5537

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5538

2) * src2_stride_z;

5539

5540

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5541

5542

#ifndef UNIT_BETA

5543

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias);

5548

5549

#endif // defined(BROADCAST_BIAS)

5550

#endif // defined(BETA)

5551

5552

#if defined(ACTIVATION_TYPE)

5553

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

5554

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5555

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5556

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5557

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5558

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5559

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5560

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5561

}

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

5562

5563

// Undefine local defines

5564

#undef COLS_MTX_B

5565

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

5566

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5567

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

5568

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5569

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5570

#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

5571

#if defined(DATA_TYPE)

5572

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5573

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.

5574

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5575

* @note This OpenCL kernel works with floating point data types (F16/F32)

5576

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

5577

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5578

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5579

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5580

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5581

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5582

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

5583

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5584

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5585

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5586

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5587

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5588

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5589

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5590

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5591

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5592

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5593

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

5594

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5595

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

5596

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5597

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5598

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5599

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

5600

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5601

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

5602

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5603

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5604

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5605

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5606

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5607

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5608

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5609

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5610

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5611

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5612

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5613

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5614

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5615

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5616

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5617

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5618

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5619

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5620

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5621

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5622

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

5623

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5624

#if defined(BETA)

5625

IMAGE_DECLARATION(src2),

5626

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5627

IMAGE_DECLARATION(dst),

5628

uint src0_stride_z,

5629

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5630

#if defined(BETA)

5631

uint src2_stride_z,

5632

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5633

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5634

#if defined(REINTERPRET_INPUT_AS_3D)

5635

,

5636

uint src_cross_plane_pad

5637

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5638

#if defined(REINTERPRET_OUTPUT_AS_3D)

5639

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5640

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5641

#endif // REINTERPRET_OUTPUT_AS_3D

5642

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5643

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5644

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5645

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5646

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5647

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5648

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5649

// Update address for the matrix A

5650

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5651

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5652

// Update address for the matrix B

5653

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5654

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5655

#if defined(REINTERPRET_INPUT_AS_3D)

5656

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5657

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5663

// |******************|

5664

// | cross_plane_pad |

5665

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5670

5671

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5672

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5673

zin = min(DEPTH_GEMM3D - 1, zin);

5674

5675

// Add offset due to the cross plane paddings

5676

zin *= (src_cross_plane_pad * src0_stride_y);

5677

5678

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5679

// multiply src0_stride_z by DEPTH_GEMM3D

5680

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5681

5682

#else // defined(REINTERPRET_INPUT_AS_3D)

5683

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5684

// Add offset for batched GEMM

5685

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5686

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5687

#endif // defined(REINTERPRET_INPUT_AS_3D)

5688

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5689

#if defined(MATRIX_B_DEPTH)

5690

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5691

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

5692

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5693

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5694

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5695

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5696

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

5697

5698

VECTOR_TYPE acc0 = 0.0f;

5699

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5700

VECTOR_TYPE acc1 = 0.0f;

5701

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5702

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5703

VECTOR_TYPE acc2 = 0.0f;

5704

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5705

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5706

VECTOR_TYPE acc3 = 0.0f;

5707

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5708

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

5709

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5710

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5711

#if defined(REINTERPRET_INPUT_AS_3D)

5712

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5713

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

5714

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5715

// Load values from matrix A

5716

VEC_DATA_TYPE(DATA_TYPE, 2)

5717

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5718

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5719

VEC_DATA_TYPE(DATA_TYPE, 2)

5720

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5721

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5722

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5723

VEC_DATA_TYPE(DATA_TYPE, 2)

5724

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5725

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5726

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5727

VEC_DATA_TYPE(DATA_TYPE, 2)

5728

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5729

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5730

#endif // defined(REINTERPRET_INPUT_AS_3D)

5731

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5732

// Load values from matrix B

5733

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

5734

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5735

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5736

// Accumulate

5737

acc0 += b0 * (VECTOR_TYPE)a0.s0;

5738

acc0 += b1 * (VECTOR_TYPE)a0.s1;

5739

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5740

acc1 += b0 * (VECTOR_TYPE)a1.s0;

5741

acc1 += b1 * (VECTOR_TYPE)a1.s1;

5742

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5743

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5744

acc2 += b0 * (VECTOR_TYPE)a2.s0;

5745

acc2 += b1 * (VECTOR_TYPE)a2.s1;

5746

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5747

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5748

acc3 += b0 * (VECTOR_TYPE)a3.s0;

5749

acc3 += b1 * (VECTOR_TYPE)a3.s1;

5750

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5751

}

5752

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5753

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5754

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5755

#if defined(REINTERPRET_INPUT_AS_3D)

5756

// Load values from matrix A

5757

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

5758

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5759

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

5760

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5761

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5762

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

5763

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5764

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5765

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

5766

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5767

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5768

// Load values from matrix A

5769

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5770

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5771

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5772

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5773

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5774

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5775

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5776

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5777

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5778

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5779

#endif // defined(REINTERPRET_INPUT_AS_3D)

5780

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5781

// Load values from matrix B

5782

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5783

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5784

// Accumulate

5785

acc0 += b0 * (VECTOR_TYPE)a0;

5786

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5787

acc1 += b0 * (VECTOR_TYPE)a1;

5788

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5789

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5790

acc2 += b0 * (VECTOR_TYPE)a2;

5791

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5792

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5793

acc3 += b0 * (VECTOR_TYPE)a3;

5794

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5795

}

5796

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5797

int z = get_global_id(2);

5798

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5799

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5800

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5801

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5802

// Compute dst address

5803

__global uchar *dst_addr = offset(&dst, 0, 0);

5804

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5805

uint4 zout = 0;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5806

5807

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5808

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5809

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5810

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5811

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5816

// |******************|

5817

// | cross_plane_pad |

5818

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5823

5824

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5825

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5826

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5827

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5828

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5829

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5830

5831

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5832

// multiply dst_stride_z by DEPTH_GEMM3D

5833

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5834

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5835

// Add offset for batched GEMM

5836

dst_addr += z * dst_stride_z;

5837

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5838

5839

// Multiply by the weight of matrix-matrix product and store the result

5840

#if defined(ALPHA)

5841

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);

5842

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

5847

5848

#if defined(BROADCAST_BIAS)

5849

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));

5850

5851

LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

5852

5853

#ifndef UNIT_BETA

5854

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

5855

#endif // UNIT_BETA

5856

5857

// c = c + bias[broadcasted]

5858

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

5859

5860

#else // defined(BROADCAST_BIAS)

5861

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *

5862

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

5863

5864

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

5865

5866

#ifndef UNIT_BETA

5867

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

5872

5873

#endif // defined(BROADCAST_BIAS)

5874

#endif // defined(BETA)

5875

5876

#if defined(ACTIVATION_TYPE)

5877

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);

5878

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5879

5880

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5881

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5882

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5883

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5884

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

5885

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5886

*

5887

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

5888

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

5889

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

5890

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

5891

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5892

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5893

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5894

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5895

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5896

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5897

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5898

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5899

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5900

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5901

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5902

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5903

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5904

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5905

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5906

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

5907

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5908

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

5909

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5910

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5911

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5912

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

5913

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5914

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

5915

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5916

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5917

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5918

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5919

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5920

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5921

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5922

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

5923

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5924

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5925

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5926

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5927

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5928

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5929

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5930

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5931

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5932

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5933

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5934

*/

5935

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

5936

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5937

#if defined(BETA)

5938

IMAGE_DECLARATION(src2),

5939

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5940

IMAGE_DECLARATION(dst),

5941

uint src0_stride_z,

5942

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5943

#if defined(BETA)

5944

uint src2_stride_z,

5945

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5946

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5947

#if defined(REINTERPRET_INPUT_AS_3D)

5948

,

5949

uint src_cross_plane_pad

5950

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5951

#if defined(REINTERPRET_OUTPUT_AS_3D)

5952

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5953

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5954

#endif // REINTERPRET_OUTPUT_AS_3D

5955

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5956

{

5957

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

5958

5959

// Compute starting address for matrix A and matrix B

5960

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

5961

5962

// Update address for matrix A

5963

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

5964

5965

// Update address for matrix B

5966

src_addr.s1 += idx * sizeof(float);

5967

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5968

#if defined(REINTERPRET_INPUT_AS_3D)

5969

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5970

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5976

// |******************|

5977

// | cross_plane_pad |

5978

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5983

5984

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5985

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5986

zin = min(DEPTH_GEMM3D - 1, zin);

5987

5988

// Add offset due to the cross plane paddings

5989

zin *= (src_cross_plane_pad * src0_stride_y);

5990

5991

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5992

// multiply src0_stride_z by DEPTH_GEMM3D

5993

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5994

5995

#else // defined(REINTERPRET_INPUT_AS_3D)

5996

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5997

// Add offset for batched GEMM

5998

src_addr.s0 += get_global_id(2) * src0_stride_z;

5999

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6000

#endif // defined(REINTERPRET_INPUT_AS_3D)

6001

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6002

#if defined(MATRIX_B_DEPTH)

6003

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6004

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6005

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6006

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6007

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6008

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6009

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6010

float4 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6011

6012

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6013

float4 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6014

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6015

6016

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6017

float4 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6018

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6019

6020

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6021

float4 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6022

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6023

6024

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6025

int i = 0;

6026

for(; i <= ((int)COLS_A - 4); i += 4)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6027

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6028

#if defined(REINTERPRET_INPUT_AS_3D)

6029

// Load values from matrix A and matrix B

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6030

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

6031

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6032

// Load values from matrix A and matrix B

6033

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6034

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6035

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6036

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6037

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6038

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6039

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6040

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6041

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6042

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6043

#endif // defined(REINTERPRET_INPUT_AS_3D)

6044

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6045

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6046

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6047

6048

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6049

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

6050

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

6051

acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);

6052

acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6053

6054

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6055

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6056

acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);

6057

acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);

6058

acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);

6059

acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6060

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6061

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6062

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6063

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6064

acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);

6065

acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);

6066

acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);

6067

acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6068

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6069

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6070

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6071

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6072

acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);

6073

acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);

6074

acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);

6075

acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6076

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6077

6078

// Load the next row of values from matrix B

6079

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6080

src_addr.s1 += src1_stride_y;

6081

6082

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6083

acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);

6084

acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);

6085

acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);

6086

acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6087

6088

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6089

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6090

acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);

6091

acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);

6092

acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);

6093

acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6094

6095

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6096

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6097

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6098

acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);

6099

acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);

6100

acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);

6101

acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6102

6103

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6104

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6105

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6106

acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);

6107

acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);

6108

acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);

6109

acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6110

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6111

6112

// Load the next row of values from matrix B

6113

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6114

src_addr.s1 += src1_stride_y;

6115

6116

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6117

acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);

6118

acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);

6119

acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);

6120

acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6121

6122

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6123

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6124

acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);

6125

acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);

6126

acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);

6127

acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6128

6129

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6130

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6131

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6132

acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);

6133

acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);

6134

acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);

6135

acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6136

6137

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6138

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6139

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6140

acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);

6141

acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);

6142

acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);

6143

acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6144

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6145

6146

// Load the next row of values from matrix B

6147

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6148

src_addr.s1 += src1_stride_y;

6149

6150

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6151

acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);

6152

acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);

6153

acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);

6154

acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6155

6156

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6157

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6158

acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);

6159

acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);

6160

acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);

6161

acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6162

6163

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6164

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6165

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6166

acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);

6167

acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);

6168

acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);

6169

acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6170

6171

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6172

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6173

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6174

acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);

6175

acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);

6176

acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);

6177

acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6178

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6179

6180

src_addr.s0 += 4 * sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6181

}

6182

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6183

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6184

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6185

#if defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6186

// Load values from matrix A

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6187

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6188

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6189

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6190

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6191

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6192

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6193

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6194

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6195

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6196

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6197

#else // defined(REINTERPRET_INPUT_AS_3D)

6198

// Load values from matrix A

6199

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6200

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6201

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6202

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6203

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6204

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6205

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6206

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6207

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6208

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6209

#endif // defined(REINTERPRET_INPUT_AS_3D)

6210

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6211

// Load values from matrix B

6212

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6213

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6214

6215

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6216

acc0.s0 = fma(a0, b0.s0, acc0.s0);

6217

acc0.s1 = fma(a0, b0.s1, acc0.s1);

6218

acc0.s2 = fma(a0, b0.s2, acc0.s2);

6219

acc0.s3 = fma(a0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6220

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6221

acc1.s0 = fma(a1, b0.s0, acc1.s0);

6222

acc1.s1 = fma(a1, b0.s1, acc1.s1);

6223

acc1.s2 = fma(a1, b0.s2, acc1.s2);

6224

acc1.s3 = fma(a1, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6225

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6226

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6227

acc2.s0 = fma(a2, b0.s0, acc2.s0);

6228

acc2.s1 = fma(a2, b0.s1, acc2.s1);

6229

acc2.s2 = fma(a2, b0.s2, acc2.s2);

6230

acc2.s3 = fma(a2, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6231

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6232

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6233

acc3.s0 = fma(a3, b0.s0, acc3.s0);

6234

acc3.s1 = fma(a3, b0.s1, acc3.s1);

6235

acc3.s2 = fma(a3, b0.s2, acc3.s2);

6236

acc3.s3 = fma(a3, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6237

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6238

6239

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6240

}

6241

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6242

int z = get_global_id(2);

6243

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6244

// Compute destination address

6245

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6246

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6247

// Compute dst address

6248

__global uchar *dst_addr = offset(&dst, 0, 0);

6249

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6250

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

6251

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6252

#if defined(REINTERPRET_OUTPUT_AS_3D)

6253

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6254

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6255

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

6260

// |******************|

6261

// | cross_plane_pad |

6262

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6267

6268

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6269

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6270

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6271

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6272

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6273

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6274

6275

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6276

// multiply dst_stride_z by DEPTH_GEMM3D

6277

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6278

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6279

// Add offset for batched GEMM

6280

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6281

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6282

6283

// Multiply by the weight of matrix-matrix product and store the result

6284

#if defined(ALPHA)

6285

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

6286

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6291

6292

#if defined(BROADCAST_BIAS)

6293

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

6294

6295

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

6296

6297

#ifndef UNIT_BETA

6298

SCALE_BLOCK(1, float, bias, BETA);

6299

#endif // UNIT_BETA

6300

6301

// acc = acc + bias[broadcasted]

6302

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6303

6304

#else // defined(BROADCAST_BIAS)

6305

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *

6306

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6307

6308

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

6309

6310

#ifndef UNIT_BETA

6311

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6316

6317

#endif // defined(BROADCAST_BIAS)

6318

#endif // defined(BETA)

6319

6320

#if defined(ACTIVATION_TYPE)

6321

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

6322

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6323

6324

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6325

vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6326

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6327

vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6328

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6329

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6330

vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6331

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6332

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6333

vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6334

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6335

}

6336

6337

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

6338

*

6339

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

6340

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.

6341

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

6342

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

6343

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

6344

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6345

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

6346

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6347

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6348

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

6349

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6350

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

6351

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6352

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

6353

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

6354

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

6355

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

6356

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6357

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6358

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6359

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

6360

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6361

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

6362

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

6363

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

6364

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6365

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

6366

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6367

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

6368

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6369

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

6370

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

6371

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

6372

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

6373

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

6374

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6375

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

6376

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6377

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

6378

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6379

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

6380

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6381

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

6382

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6383

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6384

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6385

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

6386

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6387

*/

6388

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

6389

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6390

#if defined(BETA)

6391

IMAGE_DECLARATION(src2),

6392

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6393

IMAGE_DECLARATION(dst),

6394

uint src0_stride_z,

6395

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6396

#if defined(BETA)

6397

uint src2_stride_z,

6398

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6399

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6400

#if defined(REINTERPRET_INPUT_AS_3D)

6401

,

6402

uint src_cross_plane_pad

6403

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6404

#if defined(REINTERPRET_OUTPUT_AS_3D)

6405

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6406

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6407

#endif // REINTERPRET_OUTPUT_AS_3D

6408

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6409

{

6410

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6411

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

6412

6413

// Compute starting address for matrix A and Matrix B

6414

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

6415

6416

// Update address for the matrix A

6417

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6418

6419

// Update address for the matrix B

6420

src_addr.s1 += idx * sizeof(float);

6421

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6422

#if defined(REINTERPRET_INPUT_AS_3D)

6423

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6424

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

6430

// |******************|

6431

// | cross_plane_pad |

6432

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

6437

6438

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6439

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6440

zin = min(DEPTH_GEMM3D - 1, zin);

6441

6442

// Add offset due to the cross plane paddings

6443

zin *= (src_cross_plane_pad * src0_stride_y);

6444

6445

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6446

// multiply src0_stride_z by DEPTH_GEMM3D

6447

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6448

6449

#else // defined(REINTERPRET_INPUT_AS_3D)

6450

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6451

// Add offset for batched GEMM

6452

src_addr.s0 += get_global_id(2) * src0_stride_z;

6453

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6454

#endif // defined(REINTERPRET_INPUT_AS_3D)

6455

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6456

#if defined(MATRIX_B_DEPTH)

6457

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6458

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6459

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6460

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6461

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6462

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6463

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6464

float2 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6465

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6466

float2 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6467

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6468

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6469

float2 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6470

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6471

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6472

float2 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6473

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6474

6475

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6476

int i = 0;

6477

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6478

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6479

#if defined(REINTERPRET_INPUT_AS_3D)

6480

// Load values from matrix A

6481

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));

6482

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6483

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6484

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6485

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6486

6487

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6488

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6489

src_addr.s1 += src1_stride_y;

6490

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6491

src_addr.s1 += src1_stride_y;

6492

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6493

src_addr.s1 += src1_stride_y;

6494

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6495

src_addr.s1 += src1_stride_y;

6496

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6497

src_addr.s1 += src1_stride_y;

6498

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6499

src_addr.s1 += src1_stride_y;

6500

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6501

src_addr.s1 += src1_stride_y;

6502

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6503

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6504

6505

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6506

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

6507

acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);

6508

acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);

6509

acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);

6510

acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);

6511

acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);

6512

acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);

6513

acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6514

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6515

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

6516

acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);

6517

acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);

6518

acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);

6519

acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);

6520

acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);

6521

acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);

6522

acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6523

6524

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6525

#if defined(REINTERPRET_INPUT_AS_3D)

6526

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6527

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6528

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6529

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6530

acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);

6531

acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);

6532

acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);

6533

acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);

6534

acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);

6535

acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);

6536

acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);

6537

acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6538

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6539

acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);

6540

acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);

6541

acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);

6542

acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);

6543

acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);

6544

acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);

6545

acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);

6546

acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6547

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6548

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6549

#if defined(REINTERPRET_INPUT_AS_3D)

6550

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6551

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6552

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6553

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6554

acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);

6555

acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);

6556

acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);

6557

acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);

6558

acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);

6559

acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);

6560

acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);

6561

acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6562

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6563

acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);

6564

acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);

6565

acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);

6566

acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);

6567

acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);

6568

acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);

6569

acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);

6570

acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6571

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6572

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6573

#if defined(REINTERPRET_INPUT_AS_3D)

6574

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6575

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6576

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6577

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6578

acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);

6579

acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);

6580

acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);

6581

acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);

6582

acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);

6583

acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);

6584

acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);

6585

acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6586

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6587

acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);

6588

acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);

6589

acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);

6590

acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);

6591

acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);

6592

acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);

6593

acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);

6594

acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6595

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6596

6597

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6598

}

6599

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6600

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6601

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6602

#if defined(REINTERPRET_INPUT_AS_3D)

6603

// Load values from matrix A

6604

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6605

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6606

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6607

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6608

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6609

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6610

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6611

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6612

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6613

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6614

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6615

// Load values from matrix A

6616

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6617

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6618

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6619

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6620

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6621

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6622

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6623

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6624

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6625

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6626

#endif // defined(REINTERPRET_INPUT_AS_3D)

6627

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6628

// Load values from matrix B

6629

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6630

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6631

6632

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6633

acc0.s0 = fma(a0, b0.s0, acc0.s0);

6634

acc0.s1 = fma(a0, b0.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6635

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6636

acc1.s0 = fma(a1, b0.s0, acc1.s0);

6637

acc1.s1 = fma(a1, b0.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6638

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6639

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6640

acc2.s0 = fma(a2, b0.s0, acc2.s0);

6641

acc2.s1 = fma(a2, b0.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6642

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6643

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6644

acc3.s0 = fma(a3, b0.s0, acc3.s0);

6645

acc3.s1 = fma(a3, b0.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6646

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6647

6648

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6649

}

6650

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6651

int z = get_global_id(2);

6652

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6653

// Compute destination address

6654

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6655

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6656

// Compute dst address

6657

__global uchar *dst_addr = offset(&dst, 0, 0);

6658

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6659

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

6660

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6661

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6662

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6663

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6664

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6665

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

6670

// |******************|

6671

// | cross_plane_pad |

6672

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6677

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6678

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6679

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6680

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6681

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6682

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6683

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6684

6685

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6686

// multiply dst_stride_z by DEPTH_GEMM3D

6687

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6688

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6689

// Add offset for batched GEMM

6690

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6691

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6692

6693

// Multiply by the weight of matrix-matrix product and store the result

6694

#if defined(ALPHA)

6695

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

6696

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6701

6702

#if defined(BROADCAST_BIAS)

6703

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));

6704

6705

LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

6706

6707

#ifndef UNIT_BETA

6708

SCALE_BLOCK(1, float, bias, BETA);

6709

#endif // UNIT_BETA

6710

6711

// acc = acc + bias[broadcasted]

6712

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6713

6714

#else // defined(BROADCAST_BIAS)

6715

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *

6716

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6717

6718

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

6719

6720

#ifndef UNIT_BETA

6721

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6726

6727

#endif // defined(BROADCAST_BIAS)

6728

#endif // defined(BETA)

6729

6730

#if defined(ACTIVATION_TYPE)

6731

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

6732

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6733

6734

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6735

vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6736

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6737

vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6738

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6739

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6740

vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6741

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6742

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6743

vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6744

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6745

}

6746

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

6747

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6748

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

6749

*

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6750

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.

6751

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

6752

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

6753

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

6754

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6755

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

6756

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6757

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6758

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

6759

* The activation function is performed after the bias addition

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6760

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

6761

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

6762

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

6763

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

6764

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

6765

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

6766

*

6767

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

6768

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6769

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

6770

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6771

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

6772

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

6773

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

6774

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6775

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

6776

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6777

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

6778

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6779

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

6780

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

6781

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

6782

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

6783

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

6784

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6785

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

6786

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6787

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

6788

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6789

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

6790

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

6791

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

6792

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6793

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6794

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

6795

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

6796

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

6797

*/

6798

__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),

6799

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6800

#if defined(BETA)

6801

IMAGE_DECLARATION(src2),

6802

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6803

IMAGE_DECLARATION(dst),

6804

uint src0_stride_z,

6805

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6806

#if defined(BETA)

6807

uint src2_stride_z,

6808

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6809

uint dst_stride_z

6810

#if defined(REINTERPRET_INPUT_AS_3D)

6811

,

6812

uint src_cross_plane_pad

6813

#endif // REINTERPRET_INPUT_AS_3D

6814

#if defined(REINTERPRET_OUTPUT_AS_3D)

6815

,

6816

uint dst_cross_plane_pad

6817

#endif // REINTERPRET_OUTPUT_AS_3D

6818

)

6819

{

6820

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

6821

6822

// Compute starting address for matrix A and Matrix B

6823

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

6824

6825

// Update address for the matrix A

6826

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6827

6828

// Update address for the matrix B

6829

src_addr.s1 += idx * sizeof(half);

6830

6831

#if defined(REINTERPRET_INPUT_AS_3D)

6832

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6833

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

6839

// |******************|

6840

// | cross_plane_pad |

6841

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

6846

6847

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6848

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6849

zin = min(DEPTH_GEMM3D - 1, zin);

6850

6851

// Add offset due to the cross plane paddings

6852

zin *= (src_cross_plane_pad * src0_stride_y);

6853

6854

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6855

// multiply src0_stride_z by DEPTH_GEMM3D

6856

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6857

6858

#else // defined(REINTERPRET_INPUT_AS_3D)

6859

6860

// Add offset for batched GEMM

6861

src_addr.s0 += get_global_id(2) * src0_stride_z;

6862

6863

#endif // defined(REINTERPRET_INPUT_AS_3D)

6864

6865

#if defined(MATRIX_B_DEPTH)

6866

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6867

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6868

#else // defined(MATRIX_B_DEPTH)

6869

src_addr.s1 += get_global_id(2) * src1_stride_z;

6870

#endif // defined(MATRIX_B_DEPTH)

6871

6872

float8 acc0 = 0.0h;

6873

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6874

float8 acc1 = 0.0h;

6875

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6876

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6877

float8 acc2 = 0.0h;

6878

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6879

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6880

float8 acc3 = 0.0h;

6881

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6882

6883

int i = 0;

6884

for(; i <= ((int)COLS_A - 4); i += 4)

6885

{

6886

#if defined(REINTERPRET_INPUT_AS_3D)

6887

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6888

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

6889

#else // defined(REINTERPRET_INPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6890

// Load values from matrix A

6891

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6892

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6893

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6894

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6895

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6896

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6897

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6898

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6899

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6900

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6901

#endif // defined(REINTERPRET_INPUT_AS_3D)

6902

6903

// Load values from matrix B

6904

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6905

src_addr.s1 += src1_stride_y;

6906

6907

// Accumulate

6908

acc0 = fma(b0, (float8)a0.s0, acc0);

6909

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6910

acc1 = fma(b0, (float8)a1.s0, acc1);

6911

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6912

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6913

acc2 = fma(b0, (float8)a2.s0, acc2);

6914

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6915

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6916

acc3 = fma(b0, (float8)a3.s0, acc3);

6917

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6918

6919

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6920

src_addr.s1 += src1_stride_y;

6921

acc0 = fma(b0, (float8)a0.s1, acc0);

6922

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6923

acc1 = fma(b0, (float8)a1.s1, acc1);

6924

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6925

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6926

acc2 = fma(b0, (float8)a2.s1, acc2);

6927

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6928

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6929

acc3 = fma(b0, (float8)a3.s1, acc3);

6930

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6931

6932

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6933

src_addr.s1 += src1_stride_y;

6934

acc0 = fma(b0, (float8)a0.s2, acc0);

6935

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6936

acc1 = fma(b0, (float8)a1.s2, acc1);

6937

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6938

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6939

acc2 = fma(b0, (float8)a2.s2, acc2);

6940

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6941

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6942

acc3 = fma(b0, (float8)a3.s2, acc3);

6943

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6944

6945

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6946

src_addr.s1 += src1_stride_y;

6947

acc0 = fma(b0, (float8)a0.s3, acc0);

6948

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6949

acc1 = fma(b0, (float8)a1.s3, acc1);

6950

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6951

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6952

acc2 = fma(b0, (float8)a2.s3, acc2);

6953

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6954

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6955

acc3 = fma(b0, (float8)a3.s3, acc3);

6956

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6957

6958

src_addr.s0 += 4 * sizeof(half);

6959

}

6960

6961

for(; i < (int)COLS_A; ++i)

6962

{

6963

#if defined(REINTERPRET_INPUT_AS_3D)

6964

// Load values from matrix A

6965

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6966

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6967

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6968

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6969

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6970

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6971

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6972

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6973

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6974

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6975

#else // defined(REINTERPRET_INPUT_AS_3D)

6976

// Load values from matrix A

6977

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6978

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6979

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6980

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6981

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6982

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6983

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6984

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6985

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6986

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6987

#endif // defined(REINTERPRET_INPUT_AS_3D)

6988

6989

// Load values from matrix B

6990

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6991

6992

src_addr += (int2)(sizeof(half), src1_stride_y);

6993

6994

// Accumulate

6995

acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;

6996

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6997

acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;

6998

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6999

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7000

acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;

7001

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7002

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7003

acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;

7004

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7005

}

7006

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7007

int z = get_global_id(2);

7008

7009

// Compute destination address

7010

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

7011

7012

// Compute dst address

7013

__global uchar *dst_addr = offset(&dst, 0, 0);

7014

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7015

uint4 zout = 0;

7016

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7017

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7018

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7019

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

7020

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

7026

// |******************|

7027

// | cross_plane_pad |

7028

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

7033

7034

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7035

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

7036

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7037

7038

// Add offset due to the cross plane paddings

7039

zout *= (dst_cross_plane_pad * dst_stride_y);

7040

7041

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

7042

// multiply dst_stride_z by DEPTH_GEMM3D

7043

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7044

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7045

// Add offset for batched GEMM

7046

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7047

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7048

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7049

// Multiply by the weight of matrix-matrix product and store the result

7050

#if defined(ALPHA)

7051

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

7052

#endif // defined(ALPHA)

7053

7054

#if defined(BETA)

7055

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

7056

7057

#if defined(BROADCAST_BIAS)

7058

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

7059

7060

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7061

7062

float8 bias_f0 = convert_float8(bias0);

7063

7064

#ifndef UNIT_BETA

7065

SCALE_BLOCK(1, float, bias_f, BETA);

7066

#endif // UNIT_BETA

7067

7068

// acc = acc + bias[broadcasted]

7069

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);

7070

7071

#else // defined(BROADCAST_BIAS)

7072

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

7073

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

7074

7075

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7076

7077

float8 bias_f0 = convert_float8(bias0);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7078

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7079

float8 bias_f1 = convert_float8(bias1);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7080

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7081

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7082

float8 bias_f2 = convert_float8(bias2);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7083

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7084

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7085

float8 bias_f3 = convert_float8(bias3);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7086

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7087

7088

#ifndef UNIT_BETA

7089

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);

7094

7095

#endif // defined(BROADCAST_BIAS)

7096

#endif // defined(BETA)

7097

7098

half8 acc_h0 = convert_half8(acc0);

7099

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7100

half8 acc_h1 = convert_half8(acc1);

7101

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7102

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7103

half8 acc_h2 = convert_half8(acc2);

7104

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7105

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7106

half8 acc_h3 = convert_half8(acc3);

7107

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7108

7109

#if defined(ACTIVATION_TYPE)

7110

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);

7111

#endif // defined(ACTIVATION_TYPE)

7112

7113

// Store the output block

7114

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7115

}

7116

7117

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

7118

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7119

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.

7120

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

7121

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

7122

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

7123

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7124

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

7125

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7126

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7127

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

7128

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7129

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

7130

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7131

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

7132

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

7133

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

7134

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

7135

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7136

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

7137

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

7138

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

7139

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

7140

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

7141

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

7142

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

7143

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

7144

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

7145

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

7146

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

7147

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7148

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

7149

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

7150

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

7151

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

7152

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

7153

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7154

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

7155

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7156

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

7157

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7158

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

7159

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7160

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

7161

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7162

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7163

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7164

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

7165

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7166

*/

7167

__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),

7168

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7169

#if defined(BETA)

7170

IMAGE_DECLARATION(src2),

7171

#endif // defined(BETA)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7172

IMAGE_DECLARATION(dst),

7173

uint src0_stride_z,

7174

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7175

#if defined(BETA)

7176

uint src2_stride_z,

7177

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7178

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7179

#if defined(REINTERPRET_INPUT_AS_3D)

7180

,

7181

uint src_cross_plane_pad

7182

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7183

#if defined(REINTERPRET_OUTPUT_AS_3D)

7184

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7185

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7186

#endif // REINTERPRET_OUTPUT_AS_3D

7187

)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7188

{

7189

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

7190

7191

// Compute starting address for matrix A and Matrix B

7192

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

7193

7194

// Update address for the matrix A

7195

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

7196

7197

// Update address for the matrix B

7198

src_addr.s1 += idx * sizeof(half);

7199

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7200

#if defined(REINTERPRET_INPUT_AS_3D)

7201

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

7202

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

7208

// |******************|

7209

// | cross_plane_pad |

7210

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

7215

7216

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

7217

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

7218

zin = min(DEPTH_GEMM3D - 1, zin);

7219

7220

// Add offset due to the cross plane paddings

7221

zin *= (src_cross_plane_pad * src0_stride_y);

7222

7223

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

7224

// multiply src0_stride_z by DEPTH_GEMM3D

7225

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

7226

7227

#else // defined(REINTERPRET_INPUT_AS_3D)

7228

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7229

// Add offset for batched GEMM

7230

src_addr.s0 += get_global_id(2) * src0_stride_z;

7231

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7232

#endif // defined(REINTERPRET_INPUT_AS_3D)

7233

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7234

#if defined(MATRIX_B_DEPTH)

7235

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

7236

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

7237

#else // defined(MATRIX_B_DEPTH)

7238

src_addr.s1 += get_global_id(2) * src1_stride_z;

7239

#endif // defined(MATRIX_B_DEPTH)

7240

7241

half8 acc0 = 0.0h;

7242

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7243

half8 acc1 = 0.0h;

7244

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7245

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7246

half8 acc2 = 0.0h;

7247

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7248

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7249

half8 acc3 = 0.0h;

7250

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7251

7252

int i = 0;

7253

for(; i <= ((int)COLS_A - 4); i += 4)

7254

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7255

#if defined(REINTERPRET_INPUT_AS_3D)

7256

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

7257

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

7258

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7259

// Load values from matrix A

7260

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

7261

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7262

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

7263

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7264

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7265

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

7266

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7267

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7268

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

7269

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7270

#endif // defined(REINTERPRET_INPUT_AS_3D)

7271

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7272

// Load values from matrix B

7273

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

7274

src_addr.s1 += src1_stride_y;

7275

7276

// Accumulate

7277

acc0 = fma(b0, (half8)a0.s0, acc0);

7278

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7279

acc1 = fma(b0, (half8)a1.s0, acc1);

7280

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7281

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7282

acc2 = fma(b0, (half8)a2.s0, acc2);

7283

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7284

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7285

acc3 = fma(b0, (half8)a3.s0, acc3);

7286

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7287

7288

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

7289

src_addr.s1 += src1_stride_y;

7290

acc0 = fma(b0, (half8)a0.s1, acc0);

7291

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7292

acc1 = fma(b0, (half8)a1.s1, acc1);

7293

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7294

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7295

acc2 = fma(b0, (half8)a2.s1, acc2);

7296

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7297

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7298

acc3 = fma(b0, (half8)a3.s1, acc3);

7299

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7300

7301

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

7302

src_addr.s1 += src1_stride_y;

7303

acc0 = fma(b0, (half8)a0.s2, acc0);

7304

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7305

acc1 = fma(b0, (half8)a1.s2, acc1);

7306

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7307

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7308

acc2 = fma(b0, (half8)a2.s2, acc2);

7309

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7310

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7311

acc3 = fma(b0, (half8)a3.s2, acc3);

7312

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7313

7314

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

7315

src_addr.s1 += src1_stride_y;

7316

acc0 = fma(b0, (half8)a0.s3, acc0);

7317

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7318

acc1 = fma(b0, (half8)a1.s3, acc1);

7319

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7320

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7321

acc2 = fma(b0, (half8)a2.s3, acc2);

7322

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7323

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7324

acc3 = fma(b0, (half8)a3.s3, acc3);

7325

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7326

7327

src_addr.s0 += 4 * sizeof(half);

7328

}

7329

7330

for(; i < (int)COLS_A; ++i)

7331

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7332

#if defined(REINTERPRET_INPUT_AS_3D)

7333

// Load values from matrix A

7334

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

7335

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7336

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

7337

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7338

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7339

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

7340

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7341

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7342

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

7343

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7344

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7345

// Load values from matrix A

7346

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

7347

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7348

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

7349

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7350

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7351

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

7352

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7353

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7354

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

7355

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7356

#endif // defined(REINTERPRET_INPUT_AS_3D)

7357

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7358

// Load values from matrix B

7359

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

7360

7361

src_addr += (int2)(sizeof(half), src1_stride_y);

7362

7363

// Accumulate

7364

acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;

7365

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7366

acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;

7367

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7368

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7369

acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;

7370

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7371

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7372

acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;

7373

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7374

}

7375

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7376

int z = get_global_id(2);

7377

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7378

// Compute destination address

7379

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

7380

7381

// Compute dst address

7382

__global uchar *dst_addr = offset(&dst, 0, 0);

7383

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7384

uint4 zout = 0;

7385

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7386

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7387

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7388

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

7389

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7390

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

7395

// |******************|

7396

// | cross_plane_pad |

7397

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7402

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7403

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7404

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

7405

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7406

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

7407

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7408

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7409

7410

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

7411

// multiply dst_stride_z by DEPTH_GEMM3D

7412

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7413

#else // defined(REINTERPRET_OUTPUT_AS_3D)

7414

// Add offset for batched GEMM

7415

dst_addr += z * dst_stride_z;

7416

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

7417

7418

// Multiply by the weight of matrix-matrix product and store the result

7419

#if defined(ALPHA)

7420

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);

7421

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

7426

7427

#if defined(BROADCAST_BIAS)

7428

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

7429

7430

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7431

7432

#ifndef UNIT_BETA

7433

SCALE_BLOCK(1, half, bias, BETA);

7434

#endif // UNIT_BETA

7435

7436

// acc = acc + bias[broadcasted]

7437

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

7438

7439

#else // defined(BROADCAST_BIAS)

7440

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

7441

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

7442

7443

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7444

7445

#ifndef UNIT_BETA

7446

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

7451

7452

#endif // defined(BROADCAST_BIAS)

7453

#endif // defined(BETA)

7454

7455

#if defined(ACTIVATION_TYPE)

7456

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);

7457

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7458

7459

// Store the output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

7460

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7461

}

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

7462

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7463

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

7464

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7465

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7466

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7467

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

7468

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7469

* @note The beta's value needs to be passed at compile time using -DBETA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7470

*

7471

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

7472

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

7473

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7474

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

7475

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7476

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

7477

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7478

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7479

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7480

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7481

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

7482

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7483

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7484

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

7485

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7486

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7487

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7488

__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),

7489

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7490

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7491

// Compute source and destination addresses

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7492

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

7493

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7494

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7495

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7496

float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

7497

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7498

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7499

float4 c = vload4(0, (__global float *)src.ptr);

7500

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7501

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7502

float4 out = alpha_ab + (float4)BETA * c;

7503

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7504

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7505

vstore4(out, 0, (__global float *)dst.ptr);

7506

}

7507

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

7508

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7509

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

7510

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7511

* @note The beta's value needs to be passed at compile time using -DBETA

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7512

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7513

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

7514

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

7515

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7516

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

7517

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7518

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

7519

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7520

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7521

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7522

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7523

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

7524

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7525

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7526

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

7527

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7528

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7529

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7530

__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),

7531

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7532

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7533

// Compute source and destination addresses

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7534

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

7535

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7536

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7537

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7538

half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

7539

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7540

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7541

half8 c = vload8(0, (__global half *)src.ptr);

7542

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7543

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7544

half8 out = alpha_ab + (half8)BETA * c;

7545

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7546

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7547

vstore8(out, 0, (__global half *)dst.ptr);

7548

}

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

7549

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7550

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7551

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7552

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7553

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

7554

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7555

* @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7556

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7557

* @note The input A and matrix B must not be reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7558

*

7559

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

7560

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

7561

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

7562

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

7563

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

7564

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7565

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7566

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

7567

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

7568

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

7569

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

7570

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

7571

* @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)

7572

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7573

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7574

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7575

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

7576

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7577

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

7578

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7579

*/

7580

__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),

7581

TENSOR3D_DECLARATION(src1),

7582

IMAGE_DECLARATION(dst))

7583

{

7584

int idx = get_global_id(0) * 4;

7585

int idy = get_global_id(1);

7586

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7587

// Compute the address for the vector A and matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7588

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));

7589

src_addr.s1 += idx * sizeof(float);

7590

7591

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

float4 acc = 0.0f;

Georgios Pinitas

2017-10-20 18:52:20 +0100

[diff] [blame]

7595

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7596

{

7597

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));

7598

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

7599

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

7600

7601

acc += b0 * (float4)a0.s0;

7602

acc += b1 * (float4)a0.s1;

7603

}

7604

7605

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

7606

{

7607

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

7608

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

7609

7610

acc += b0 * (float4)a0;

7611

}

7612

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7613

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7614

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

7615

7616

vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));

7617

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7618

#endif // defined(WIDTH_VECTOR_A)

7619

7620

/** This kernel accumulates each row with the biases vector.

7621

*

7622

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.

7623

* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.

7624

*

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

7625

* @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/U16/S16/F16/U32/S32/F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7626

* @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)

7627

* @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)

7628

* @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)

7629

* @param[in] accum_step_y accum_stride_y * number of elements along Y processed per workitem(in bytes)

7630

* @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor

7631

* @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr

7632

* @param[in] biases_stride_x Stride of the biases vector in X dimension (in bytes)

7633

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

7634

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases vector

7635

*/

7636

#if defined(DATA_TYPE) && defined(VECTOR_SIZE)

7637

__kernel void gemm_accumulate_biases(

7638

IMAGE_DECLARATION(accum),

7639

VECTOR_DECLARATION(biases))

7640

{

7641

Image accum = CONVERT_TO_IMAGE_STRUCT(accum);

7642

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

7643

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7644

// Vector size, e.g. number of vector elements.

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7645

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

7646

accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);

7647

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

7648

biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

7649

accum_value = biases_value + accum_value;

Anton Lokhmotov