Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2019-07-19 09:54:47 +0100

[diff] [blame^]

49

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

50

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

51

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

52

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

53

* @note Only the following values for M0, K0 and V0 are supported:

54

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

55

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

56

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

57

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

58

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

59

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

60

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

61

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

62

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

63

*

64

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

65

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

66

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

67

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

68

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

69

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

70

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

71

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

72

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

73

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

74

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

75

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

76

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

77

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

78

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

79

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

80

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

81

*/

82

__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                         ,
                                         uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                        )
{
    // Number of elements in one M0xK0 block
#define BLOCK_SIZE ((M0) * (K0))

    // OUTPUT_OFFSET_X: distance (in elements) between the first elements of two consecutive blocks
    //                  that share the same output row.
    // OUTPUT_STEP_X:   distance (in elements) between two consecutive rows of the same block in the output.
    // Both macros are fully parenthesized so that they expand safely inside larger expressions.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#define OUTPUT_STEP_X ((K0) * (V0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // One work-item handles one M0xK0 block: x selects the block column, y the block row, z the batch
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: K0 elements per step along X, M0 rows per step along Y
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 consecutive block rows (y) are packed on the same output row (y / V0),
    // and (y % V0) selects the position of the block inside that group
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y)
                                 + ((y % (uint)V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Load the M0 rows a0...a(M0-1) from the LHS matrix
    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

    // Apply the X boundary handling to every loaded row.
    // NOTE(review): BOUNDARY_CONDITION_X is defined outside this block; presumably it clears the
    // elements that fall beyond SRC_WIDTH - confirm against its definition.
    BOUNDARY_CONDITION_X(x, a0);
#if M0 > 1
    BOUNDARY_CONDITION_X(x, a1);
#endif // M0 > 1
#if M0 > 2
    BOUNDARY_CONDITION_X(x, a2);
#endif // M0 > 2
#if M0 > 3
    BOUNDARY_CONDITION_X(x, a3);
#endif // M0 > 3
#if M0 > 4
    BOUNDARY_CONDITION_X(x, a4);
#endif // M0 > 4
#if M0 > 5
    BOUNDARY_CONDITION_X(x, a5);
#endif // M0 > 5
#if M0 > 6
    BOUNDARY_CONDITION_X(x, a6);
#endif // M0 > 6
#if M0 > 7
    BOUNDARY_CONDITION_X(x, a7);
#endif // M0 > 7

    // ---------------------------Store output values ------------------------------

    // Create variables: uint zout0=0, zout1=0, ... zout15=0 (no extra Z offset on the output)
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

175

176

/* TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)
 *
 * Gathers element "i" (a single hexadecimal digit, 0-F) of each of the M0 row vectors a0...a(M0-1)
 * that are in scope at the point of use, and stores these M0 values contiguously at
 *
 *     output_ptr + 0x<i> * output_step_x * sizeof(DATA_TYPE)
 *
 * i.e. it writes column "i" of the M0xK0 block as one row of the transposed block.
 *
 * output_ptr    : byte pointer to the start of the destination block
 * output_step_x : distance, in elements, between two consecutive stored columns
 * i             : column index, pasted both as the vector component (.s<i>) and as the hex literal 0x<i>
 *
 * For M0 = 5, 6, 7 the store is split into a 4-wide store followed by a store of the remaining
 * 1, 2 or 3 elements. The pointer arguments are parenthesized so that expression arguments
 * (e.g. "ptr + offset" or "(M0) * (V0)") are evaluated as a unit.
 */
#if M0 == 2
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                      \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i);                                           \
        VSTORE(M0)                                                                                        \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
    })
#elif M0 == 3 // M0 == 3
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                      \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i);                                  \
        VSTORE(M0)                                                                                        \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
    })
#elif M0 == 4 // M0 == 4
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                      \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                         \
        VSTORE(M0)                                                                                        \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
    })
#elif M0 == 5 // M0 == 5
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                       \
        res0           = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);               \
        DATA_TYPE res1 = a4.s##i;                                                                         \
        VSTORE(4)                                                                                         \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));    \
        *((__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4) = res1; \
    })
#elif M0 == 6 // M0 == 6
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                            \
    ({                                                                                                      \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                         \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                           \
        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                         \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i);                                             \
        VSTORE(4)                                                                                           \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));      \
        VSTORE(2)                                                                                           \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4);  \
    })
#elif M0 == 7 // M0 == 7
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                            \
    ({                                                                                                      \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                         \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                           \
        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                         \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i);                                    \
        VSTORE(4)                                                                                           \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));      \
        VSTORE(3)                                                                                           \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4);  \
    })
#elif M0 == 8 // M0 == 8
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                                      \
    ({                                                                                                                \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                                  \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
        VSTORE(M0)                                                                                                    \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));                 \
    })
#else // M0 not supported
#error "M0 value not supported"
#endif // M0 conditions

245

246

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in

247

* the output matrix unrolling the values.

248

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

249

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

250

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

251

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

252

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

253

* @note Only the following values for M0, K0 and V0 are supported:

254

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

255

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

256

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

257

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

258

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

259

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

260

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

261

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

262

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

263

*

264

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

265

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

266

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

267

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

268

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

269

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

270

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

271

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

272

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

273

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

274

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

275

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

276

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

277

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

278

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

279

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

280

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

281

*/

282

__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),

283

TENSOR3D_DECLARATION(dst)

284

#if defined(REINTERPRET_INPUT_AS_3D)

285

,

286

uint cross_plane_pad

287

#endif // REINTERPRET_INPUT_AS_3D

)

{

// Block size

#define BLOCK_SIZE ((M0) * (K0))

292

293

// Output offset X

294

#if defined(INTERLEAVE)

295

#define OUTPUT_OFFSET_X (M0)

296

#else // defined(INTERLEAVE)

297

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

298

#endif // defined(INTERLEAVE)

299

300

// Output step X

301

#if defined(INTERLEAVE)

302

#define OUTPUT_STEP_X (M0) * (V0)

303

#else // Do not interleave

304

#define OUTPUT_STEP_X (M0)

305

#endif // defined(INTERLEAVE)

306

307

// Compute source and destination addresses

308

uint x = get_global_id(0);

309

uint y = get_global_id(1);

310

uint z = get_global_id(2);

311

312

// ------------------ Compute input/output addresses ---------------------------

313

314

// Compute the input address

315

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

316

317

// Compute the output address

318

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *

319

(uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

320

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

321

// Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;

322

REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

323

324

#if defined(REINTERPRET_INPUT_AS_3D)

325

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

326

// multiply src_stride_z by DEPTH_GEMM3D

327

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

328

input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

329

330

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

331

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

332

333

#else // defined(REINTERPRET_INPUT_AS_3D)

334

335

input_ptr += z * (uint)src_stride_z;

336

337

#endif // defined(REINTERPRET_INPUT_AS_3D)

338

339

// Add offset for batched GEMM

340

output_ptr += z * (uint)dst_stride_z;

341

342

// ---------------------------Load input values --------------------------------

343

344

// Load values from the LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

345

LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

346

BOUNDARY_CONDITION_X(x, a0);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

347

#if M0 > 1

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

348

BOUNDARY_CONDITION_X(x, a1);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

349

#endif // M0 > 1

350

#if M0 > 2

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

351

BOUNDARY_CONDITION_X(x, a2);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

352

#endif // M0 > 2

353

#if M0 > 3

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

354

BOUNDARY_CONDITION_X(x, a3);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

355

#endif // M0 > 3

356

#if M0 > 4

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

357

BOUNDARY_CONDITION_X(x, a4);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

358

#endif // M0 > 4

359

#if M0 > 5

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

360

BOUNDARY_CONDITION_X(x, a5);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

361

#endif // M0 > 5

362

#if M0 > 6

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

363

BOUNDARY_CONDITION_X(x, a6);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

364

#endif // M0 > 6

365

#if M0 > 7

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

366

BOUNDARY_CONDITION_X(x, a7);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

367

#endif // M0 > 7

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

368

// ---------------------------Transpose and store block -----------------------

369

370

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);

371

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);

372

#if K0 > 2

373

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

374

#endif // K0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

375

#if K0 > 3

376

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);

377

#endif // K0 > 3

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

378

#if K0 > 4

379

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);

380

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);

381

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);

382

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);

383

#endif // K0 > 4

384

#if K0 > 8

385

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);

386

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);

387

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);

388

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);

389

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);

390

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);

391

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);

392

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);

#endif // K0 > 8

#undef BLOCK_SIZE

#undef OUTPUT_OFFSET_X

397

#undef OUTPUT_STEP_X

398

}

Gian Marco Iodice

2019-01-21 17:14:31 +0000

[diff] [blame]

399

#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

400

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

401

#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

402

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in

403

* the output matrix unrolling the values.

404

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

405

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

406

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

407

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

408

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

409

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

410

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

411

* N0: 2,3,4,8,16

412

* K0: 1,2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

413

* H0: greater than 0

414

*

415

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

416

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

417

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

418

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

419

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

420

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

421

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

422

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

423

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

424

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

425

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

426

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

427

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

428

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

429

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

430

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

431

*/

432

__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),

433

TENSOR3D_DECLARATION(dst))

434

{

435

// Block size

436

#define BLOCK_SIZE ((K0) * (N0))

437

438

// Output offset X

439

#if defined(INTERLEAVE)

440

#define OUTPUT_OFFSET_X (N0)

441

#else // defined(INTERLEAVE)

442

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

443

#endif // defined(INTERLEAVE)

444

445

// Output step X

446

#if defined(INTERLEAVE)

447

#define OUTPUT_STEP_X (N0) * (H0)

448

#else // Do not interleave

449

#define OUTPUT_STEP_X (N0)

450

#endif // defined(INTERLEAVE)

451

452

// Compute source and destination addresses

453

uint x = get_global_id(0);

454

uint y = get_global_id(1);

455

uint z = get_global_id(2);

456

457

// ------------------ Compute input/output addresses ---------------------------

458

459

// Compute the input address

460

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

461

462

// Compute the output address

463

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((

464

x / (uint)H0)

465

* (uint)dst_stride_y)

466

+ z * (uint)dst_stride_z;

467

468

// ---------------------------Load input values --------------------------------

469

Vidhya Sudhan Loganathan

2019-01-08 12:17:03 +0000

[diff] [blame]

470

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

471

472

// Load values from the RHS matrix

473

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

474

#if K0 > 1

475

if(y * (uint)K0 + 1 < SRC_HEIGHT)

476

{

477

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

}

#endif // K0 > 1

#if K0 > 2

if(y * (uint)K0 + 2 < SRC_HEIGHT)

482

{

483

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

484

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

485

#endif // K0 > 2

486

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

487

if(y * (uint)K0 + 3 < SRC_HEIGHT)

488

{

489

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

490

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

491

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

492

#if K0 > 4

493

if(y * (uint)K0 + 4 < SRC_HEIGHT)

494

{

495

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

496

}

497

if(y * (uint)K0 + 5 < SRC_HEIGHT)

498

{

499

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

500

}

501

if(y * (uint)K0 + 6 < SRC_HEIGHT)

502

{

503

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

504

}

505

if(y * (uint)K0 + 7 < SRC_HEIGHT)

506

{

507

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

508

}

509

#endif // K0 > 4

510

#if K0 > 8

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

511

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

512

{

513

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

514

}

515

if(y * (uint)K0 + 9 < SRC_HEIGHT)

516

{

517

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

518

}

519

if(y * (uint)K0 + 10 < SRC_HEIGHT)

520

{

521

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

522

}

523

if(y * (uint)K0 + 11 < SRC_HEIGHT)

524

{

525

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

526

}

527

if(y * (uint)K0 + 12 < SRC_HEIGHT)

528

{

529

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

530

}

531

if(y * (uint)K0 + 13 < SRC_HEIGHT)

532

{

533

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

534

}

535

if(y * (uint)K0 + 14 < SRC_HEIGHT)

536

{

537

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

538

}

539

if(y * (uint)K0 + 15 < SRC_HEIGHT)

540

{

541

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Store output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

546

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

547

STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

548

549

#undef BLOCK_SIZE

550

#undef OUTPUT_OFFSET_X

#undef OUTPUT_STEP_X

}

#if defined(TRANSPOSE)

555

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in

556

* the output matrix unrolling the values.

557

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

558

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

559

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

560

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

561

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

562

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

563

* @note The option -DTRANSPOSE must be passed at compile time.

564

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

565

* N0: 2,3,4,8,16

566

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

567

* H0: greater than 0

568

*

569

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32

570

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

571

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

572

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

573

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

574

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

575

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

576

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

577

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

578

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

579

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

580

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

581

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

582

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

583

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

584

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

585

*/

586

__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),

587

TENSOR3D_DECLARATION(dst))

588

{

589

// Block size

590

#define BLOCK_SIZE ((K0) * (N0))

591

592

// Output offset X

593

#if defined(INTERLEAVE)

594

#define OUTPUT_OFFSET_X (K0)

595

#else // defined(INTERLEAVE)

596

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

597

#endif // defined(INTERLEAVE)

598

599

// Output step X

600

#if defined(INTERLEAVE)

601

#define OUTPUT_STEP_X (K0) * (H0)

602

#else // Do not interleave

603

#define OUTPUT_STEP_X (K0)

604

#endif // defined(INTERLEAVE)

605

606

// Compute source and destination addresses

607

uint x = get_global_id(0);

608

uint y = get_global_id(1);

609

uint z = get_global_id(2);

610

611

// ------------------ Compute input/output addresses ---------------------------

612

613

// Compute the input address

614

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

615

616

// Compute the output address

617

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /

618

(uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

619

620

// ---------------------------Load input values --------------------------------

Vidhya Sudhan Loganathan

2019-01-08 12:17:03 +0000

[diff] [blame]

621

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

622

623

// Load values from the RHS matrix

624

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

625

if(y * (uint)K0 + 1 < SRC_HEIGHT)

626

{

627

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

628

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

629

#if K0 > 2

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

630

if(y * (uint)K0 + 2 < SRC_HEIGHT)

631

{

632

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

633

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

634

#endif // K0 > 2

635

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

636

if(y * (uint)K0 + 3 < SRC_HEIGHT)

637

{

638

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

639

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

640

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

641

#if K0 > 4

642

if(y * (uint)K0 + 4 < SRC_HEIGHT)

643

{

644

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

645

}

646

if(y * (uint)K0 + 5 < SRC_HEIGHT)

647

{

648

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

649

}

650

if(y * (uint)K0 + 6 < SRC_HEIGHT)

651

{

652

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

653

}

654

if(y * (uint)K0 + 7 < SRC_HEIGHT)

655

{

656

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

657

}

658

#endif // K0 > 4

659

#if K0 > 8

Gian Marco Iodice

8912434

2018-12-19 14:17:22 +0000

[diff] [blame]

660

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

661

{

662

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

663

}

664

if(y * (uint)K0 + 9 < SRC_HEIGHT)

665

{

666

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

667

}

668

if(y * (uint)K0 + 10 < SRC_HEIGHT)

669

{

670

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

671

}

672

if(y * (uint)K0 + 11 < SRC_HEIGHT)

673

{

674

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

675

}

676

if(y * (uint)K0 + 12 < SRC_HEIGHT)

677

{

678

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

679

}

680

if(y * (uint)K0 + 13 < SRC_HEIGHT)

681

{

682

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

683

}

684

if(y * (uint)K0 + 14 < SRC_HEIGHT)

685

{

686

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

687

}

688

if(y * (uint)K0 + 15 < SRC_HEIGHT)

689

{

690

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Transpose the block ------------------------------

Vidhya Sudhan Loganathan

2019-01-08 12:17:03 +0000

[diff] [blame]

695

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

696

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

697

#if K0 == 2

698

// This part computes the following transpositions:

// 2x2 -> 2x2

// 2x4 -> 4x2

// 2x8 -> 8x2

// 2x16 -> 16x2

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);

704

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);

705

#if N0 > 2

706

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);

707

#endif // N0 > 2

708

#if N0 > 3

709

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);

710

#endif // N0 > 3

711

#if N0 > 4

712

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);

713

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);

714

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);

715

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);

716

#endif // N0 > 4

717

#if N0 > 8

718

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);

719

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);

720

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);

721

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);

722

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);

723

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);

724

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);

725

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);

726

#endif // N0 > 8

727

728

#elif K0 == 3 // K0 == 2

729

// This part computes the following transpositions:

// 3x2 -> 2x3

// 3x4 -> 4x3

// 3x8 -> 8x3

// 3x16 -> 16x3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

734

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);

735

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

736

#if N0 > 2

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

737

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

738

#endif // N0 > 2

739

#if N0 > 3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

740

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

741

#endif // N0 > 3

742

#if N0 > 4

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

743

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);

744

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);

745

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);

746

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

747

#endif // N0 > 4

748

#if N0 > 8

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

749

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);

750

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);

751

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);

752

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);

753

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);

754

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);

755

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);

756

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

757

#endif // N0 > 8

758

759

#elif K0 == 4 // K0 == 4

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

760

// This part computes the following transpositions:

// 4x2 -> 2x4

// 4x4 -> 4x4

// 4x8 -> 8x4

// 4x16 -> 16x4

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);

766

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);

767

#if N0 > 2

768

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

769

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

770

#if N0 > 3

771

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);

772

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

773

#if N0 > 4

774

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);

775

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);

776

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);

777

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);

778

#endif // N0 > 4

779

#if N0 > 8

780

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);

781

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);

782

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);

783

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);

784

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);

785

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);

786

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);

787

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);

788

#endif // N0 > 8

789

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

790

#elif K0 == 8 // K0 == 8

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

791

// This part computes the following transpositions:

// 8x2 -> 2x8

// 8x4 -> 4x8

// 8x8 -> 8x8

// 8x16 -> 16x8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

796

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);

797

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

798

#if N0 > 2

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

799

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

800

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

801

#if N0 > 3

802

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);

803

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

804

#if N0 > 4

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

805

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);

806

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);

807

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);

808

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

809

#endif // N0 > 4

810

#if N0 > 8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

811

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);

812

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);

813

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);

814

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);

815

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);

816

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);

817

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);

818

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

819

#endif // N0 > 8

820

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

821

#elif K0 == 16 // K0 == 16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

822

823

// This part computes the following transpositions:

// 16x2 -> 2x16

// 16x4 -> 4x16

// 16x8 -> 8x16

// 16x16 -> 16x16

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,

829

a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);

830

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,

831

a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);

832

#if N0 > 2

833

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,

834

a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

835

#endif // N0 > 2

836

#if N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

837

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,

838

a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

839

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

840

#if N0 > 4

841

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,

842

a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);

843

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,

844

a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);

845

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,

846

a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);

847

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,

848

a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);

849

#endif // N0 > 4

850

#if N0 > 8

851

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,

852

a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);

853

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,

854

a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);

855

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,

856

a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);

857

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,

858

a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);

859

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,

860

a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);

861

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,

862

a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);

863

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,

864

a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);

865

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,

866

a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);

#endif // N0 > 8

#else // N0 == 16

#error "Not supported N0 value"

871

#endif // N0 > 2

872

873

// ---------------------------Store the output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

874

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

875

STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

876

877

#undef BLOCK_SIZE

878

#undef OUTPUT_OFFSET_X

879

#undef OUTPUT_STEP_X

880

}

881

#endif // defined(TRANSPOSE)

882

#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

883

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

884

#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

885

886

// Token-pasting helper, e.g. CONCAT(ARM_DOT, 4) -> ARM_DOT4
#define CONCAT(a, b) a##b

// ARM_DOTn(a, b, c): accumulates the dot product of the n-element operands a and b into c
// using fma(), one component at a time in the order s0, s1, s2, ... (for n = 8 and n = 16
// the .lo half is processed before the .hi half).
//  - a, b: scalars for ARM_DOT1; otherwise operands exposing .s0 ... components (and .lo / .hi
//          halves for n = 8 and n = 16). They are parenthesised before member access so that
//          expression arguments (e.g. *ptr) expand correctly.
//  - c   : scalar accumulator; it must be a modifiable lvalue and is evaluated several times.
#define ARM_DOT1(a, b, c)     \
    ({                        \
        c = fma((a), (b), c); \
    })
#define ARM_DOT2(a, b, c)           \
    ({                              \
        c = fma((a).s0, (b).s0, c); \
        c = fma((a).s1, (b).s1, c); \
    })
#define ARM_DOT3(a, b, c)           \
    ({                              \
        ARM_DOT2(a, b, c);          \
        c = fma((a).s2, (b).s2, c); \
    })
#define ARM_DOT4(a, b, c)           \
    ({                              \
        ARM_DOT3(a, b, c);          \
        c = fma((a).s3, (b).s3, c); \
    })
#define ARM_DOT8(a, b, c)            \
    ({                               \
        ARM_DOT4((a).lo, (b).lo, c); \
        ARM_DOT4((a).hi, (b).hi, c); \
    })
#define ARM_DOT16(a, b, c)           \
    ({                               \
        ARM_DOT8((a).lo, (b).lo, c); \
        ARM_DOT8((a).hi, (b).hi, c); \
    })

// ARM_DOT_K0XN0(k0, a, b, c): for every i in [0, N0) invokes ARM_DOT<k0>(a, b<i>, c.s<i>).
//  - k0: selects the ARM_DOT<k0> macro through CONCAT
//  - a : first operand, forwarded unchanged to every ARM_DOT<k0> invocation
//  - b : name prefix of the N0 second operands (b0, b1, ..., bF are formed by token pasting)
//  - c : accumulator vector; component .s<i> receives the i-th result
// One definition is provided per supported N0 (2, 3, 4, 8, 16); any other value is rejected
// at compile time.
#if N0 == 2
#define ARM_DOT_K0XN0(k0, a, b, c)                 \
    ({                                             \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(k0, a, b, c)                 \
    ({                                             \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(k0, a, b, c)                 \
    ({                                             \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(k0, a, b, c)                 \
    ({                                             \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4)); \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5)); \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6)); \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7)); \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(k0, a, b, c)                 \
    ({                                             \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4)); \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5)); \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6)); \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7)); \
        CONCAT(ARM_DOT, k0)((a), (b##8), (c.s8)); \
        CONCAT(ARM_DOT, k0)((a), (b##9), (c.s9)); \
        CONCAT(ARM_DOT, k0)((a), (b##A), (c.sA)); \
        CONCAT(ARM_DOT, k0)((a), (b##B), (c.sB)); \
        CONCAT(ARM_DOT, k0)((a), (b##C), (c.sC)); \
        CONCAT(ARM_DOT, k0)((a), (b##D), (c.sD)); \
        CONCAT(ARM_DOT, k0)((a), (b##E), (c.sE)); \
        CONCAT(ARM_DOT, k0)((a), (b##F), (c.sF)); \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

1007

1008

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1009

* The LHS matrix is NOT reshaped

1010

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1011

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1012

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

1013

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1014

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

1015

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1016

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1017

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1018

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

1019

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1020

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1021

* - N0 = 2, 3, 4, 8, 16

1022

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1023

* - H0 >= 1

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1024

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

1025

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1026

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1027

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1028

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1029

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1030

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1031

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1032

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1033

*

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1034

* @param[in] lhs_ptr Pointer to the LHS matrix (not reshaped). Supported data type: F16/F32

1035

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

1036

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

1037

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

1038

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

1039

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

1040

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1041

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1042

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1043

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1044

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1045

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1046

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1047

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1048

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1049

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1050

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1051

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1052

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

1053

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1054

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1055

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1056

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1057

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1058

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

1059

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1060

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1061

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1062

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1063

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1064

*/

1065

__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),

1066

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1067

#if defined(BETA)

1068

IMAGE_DECLARATION(bias),

1069

#endif // defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1070

IMAGE_DECLARATION(dst),

1071

uint lhs_stride_z,

1072

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1073

#if defined(BETA)

1074

uint bias_stride_z,

1075

#endif //defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1076

uint dst_stride_z

1077

#if defined(REINTERPRET_INPUT_AS_3D)

1078

,

1079

uint lhs_cross_plane_pad

1080

#endif // REINTERPRET_INPUT_AS_3D

1081

#if defined(REINTERPRET_OUTPUT_AS_3D)

1082

,

1083

uint dst_cross_plane_pad

1084

#endif // REINTERPRET_OUTPUT_AS_3D

1085

)

1086

{

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1087

// Block size

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1088

#define RHS_BLOCK_SIZE ((K0) * (N0))

1089

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1090

// RHS offset and step X

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1091

#if defined(RHS_INTERLEAVE)

1092

#define RHS_OFFSET_X (K0)

1093

#define RHS_STEP_X ((K0) * (H0))

1094

#define RHS_STEP_LOOP (1)

1095

#else // defined(RHS_INTERLEAVE)

1096

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1097

#define RHS_STEP_X (K0)

1098

#define RHS_STEP_LOOP (H0)

1099

#endif // defined(RHS_INTERLEAVE)

1100

1101

uint x = get_global_id(0);

1102

uint y = get_global_id(1);

1103

uint z = get_global_id(2);

1104

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1105

#if defined(DUMMY_WORK_ITEMS)

1106

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1111

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1112

// Compute LHS matrix address

1113

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1114

1115

// Compute RHS matrix address

1116

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1117

1118

#if defined(MATRIX_B_DEPTH)

1119

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1120

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1121

#else // defined(MATRIX_B_DEPTH)

1122

rhs_offset += z * rhs_stride_z;

1123

#endif // defined(MATRIX_B_DEPTH)

1124

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1125

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1126

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1127

1128

#if defined(REINTERPRET_INPUT_AS_3D)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1129

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1130

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1131

1132

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1133

// multiply lhs_stride_z by DEPTH_GEMM3D

1134

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1135

1136

#else // defined(REINTERPRET_INPUT_AS_3D)

1137

1138

// Add offset for batched GEMM

1139

lhs_offset += z * lhs_stride_z;

1140

1141

#endif // defined(REINTERPRET_INPUT_AS_3D)

1142

1143

// Initialize the accumulators

1144

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1145

1146

int i = 0;

1147

for(; i <= (K - K0); i += K0)

1148

{

1149

// Supported cases (M0, K0):

1150

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1151

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1152

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1153

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1154

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1155

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1156

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1157

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1158

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1159

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1160

1161

// Load values from RHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1162

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1163

1164

// Accumulate

1165

ARM_DOT_K0XN0(K0, a0, b, c0);

1166

#if M0 > 1

1167

ARM_DOT_K0XN0(K0, a1, b, c1);

1168

#endif // M0 > 1

1169

#if M0 > 2

1170

ARM_DOT_K0XN0(K0, a2, b, c2);

1171

#endif // M0 > 2

1172

#if M0 > 3

1173

ARM_DOT_K0XN0(K0, a3, b, c3);

1174

#endif // M0 > 3

1175

#if M0 > 4

1176

ARM_DOT_K0XN0(K0, a4, b, c4);

1177

#endif // M0 > 4

1178

#if M0 > 5

1179

ARM_DOT_K0XN0(K0, a5, b, c5);

1180

#endif // M0 > 5

1181

#if M0 > 6

1182

ARM_DOT_K0XN0(K0, a6, b, c6);

1183

#endif // M0 > 6

1184

#if M0 > 7

1185

ARM_DOT_K0XN0(K0, a7, b, c7);

1186

#endif // M0 > 7

1187

1188

lhs_offset += K0 * sizeof(DATA_TYPE);

1189

rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

1190

}

1191

1192

// Left-over accumulations

1193

for(; i < K; ++i)

1194

{

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1195

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1196

LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1197

1198

// Load values from RHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1199

LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1200

1201

// Accumulate

1202

ARM_DOT_K0XN0(1, a0, b, c0);

1203

#if M0 > 1

1204

ARM_DOT_K0XN0(1, a1, b, c1);

1205

#endif // M0 > 1

1206

#if M0 > 2

1207

ARM_DOT_K0XN0(1, a2, b, c2);

1208

#endif // M0 > 2

1209

#if M0 > 3

1210

ARM_DOT_K0XN0(1, a3, b, c3);

1211

#endif // M0 > 3

1212

#if M0 > 4

1213

ARM_DOT_K0XN0(1, a4, b, c4);

1214

#endif // M0 > 4

1215

#if M0 > 5

1216

ARM_DOT_K0XN0(1, a5, b, c5);

1217

#endif // M0 > 5

1218

#if M0 > 6

1219

ARM_DOT_K0XN0(1, a6, b, c6);

1220

#endif // M0 > 6

1221

#if M0 > 7

1222

ARM_DOT_K0XN0(1, a7, b, c7);

1223

#endif // M0 > 7

1224

1225

lhs_offset += sizeof(DATA_TYPE);

1226

rhs_offset += sizeof(DATA_TYPE);

1227

}

1228

1229

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

1230

1231

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1232

1233

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1234

1235

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1236

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1237

1238

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1239

// multiply dst_stride_z by DEPTH_GEMM3D

1240

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1241

1242

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1243

1244

// Add offset for batched GEMM

1245

dst_addr += z * dst_stride_z;

1246

1247

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1248

1249

// Multiply by the weight of matrix-matrix product and store the result

1250

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1251

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1252

#endif // defined(ALPHA)

1253

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1254

// Add beta*bias

1255

#if defined(BETA)

1256

#if defined(BROADCAST_BIAS)

1257

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1258

1259

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1260

1261

#ifndef UNIT_BETA

1262

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1263

#endif // UNIT_BIAS

1264

1265

// c = c + bias[broadcasted]

1266

ADD_BLOCK_BROADCAST(M0, c, bias0);

1267

1268

#else // defined(BROADCAST_BIAS)

1269

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

1270

2) * bias_stride_z;

1271

1272

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1273

1274

#ifndef UNIT_BETA

1275

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

1280

1281

#endif // defined(BROADCAST_BIAS)

1282

#endif // defined(BETA)

1283

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1284

#if defined(ACTIVATION_TYPE)

1285

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

1286

#endif // defined(ACTIVATION_TYPE)

1287

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1288

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1289

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1290

1291

#undef RHS_BLOCK_SIZE

1292

#undef RHS_OFFSET_X

1293

#undef RHS_STEP_X

1294

}

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1295

1296

/** Fused multiply-add accumulation: c = fma(a, b, c).
 *
 * Every parameter is parenthesized so that compound expressions can be passed safely.
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator. It must be a modifiable lvalue: it is read as the addend and then overwritten
 *                   with the result, so it is evaluated twice.
 */
#define VFMA(a, b, c)             \
    ({                            \
        (c) = fma((a), (b), (c)); \
    })

/** Accumulates a single output row: c<row> = fma(broadcast(a<row>.s<i>), b, c<row>).
 *
 * @param[in]      i   Hexadecimal digit (0..F) selecting the component of the LHS vector
 * @param[in]      a   Basename of the LHS row vectors (a0, a1, ...)
 * @param[in]      b   RHS vector of N0 elements
 * @param[in, out] c   Basename of the accumulators (c0, c1, ...)
 * @param[in]      row Index of the row to accumulate
 */
#define LD_RHS_VFMA_ROW(i, a, b, c, row) VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##row).s##i), b, (c##row))

// Cumulative row lists: LD_RHS_VFMA_ROWS_<n> accumulates rows 0..n-1 in increasing order
#define LD_RHS_VFMA_ROWS_1(i, a, b, c) LD_RHS_VFMA_ROW(i, a, b, c, 0);
#define LD_RHS_VFMA_ROWS_2(i, a, b, c) LD_RHS_VFMA_ROWS_1(i, a, b, c) LD_RHS_VFMA_ROW(i, a, b, c, 1);
#define LD_RHS_VFMA_ROWS_3(i, a, b, c) LD_RHS_VFMA_ROWS_2(i, a, b, c) LD_RHS_VFMA_ROW(i, a, b, c, 2);
#define LD_RHS_VFMA_ROWS_4(i, a, b, c) LD_RHS_VFMA_ROWS_3(i, a, b, c) LD_RHS_VFMA_ROW(i, a, b, c, 3);
#define LD_RHS_VFMA_ROWS_5(i, a, b, c) LD_RHS_VFMA_ROWS_4(i, a, b, c) LD_RHS_VFMA_ROW(i, a, b, c, 4);
#define LD_RHS_VFMA_ROWS_6(i, a, b, c) LD_RHS_VFMA_ROWS_5(i, a, b, c) LD_RHS_VFMA_ROW(i, a, b, c, 5);
#define LD_RHS_VFMA_ROWS_7(i, a, b, c) LD_RHS_VFMA_ROWS_6(i, a, b, c) LD_RHS_VFMA_ROW(i, a, b, c, 6);
#define LD_RHS_VFMA_ROWS_8(i, a, b, c) LD_RHS_VFMA_ROWS_7(i, a, b, c) LD_RHS_VFMA_ROW(i, a, b, c, 7);

// Pick the row list matching the number of LHS rows (M0) processed by each work-item
#if M0 == 1
#define LD_RHS_VFMA_ROWS LD_RHS_VFMA_ROWS_1
#elif M0 == 2 // M0 == 2
#define LD_RHS_VFMA_ROWS LD_RHS_VFMA_ROWS_2
#elif M0 == 3 // M0 == 3
#define LD_RHS_VFMA_ROWS LD_RHS_VFMA_ROWS_3
#elif M0 == 4 // M0 == 4
#define LD_RHS_VFMA_ROWS LD_RHS_VFMA_ROWS_4
#elif M0 == 5 // M0 == 5
#define LD_RHS_VFMA_ROWS LD_RHS_VFMA_ROWS_5
#elif M0 == 6 // M0 == 6
#define LD_RHS_VFMA_ROWS LD_RHS_VFMA_ROWS_6
#elif M0 == 7 // M0 == 7
#define LD_RHS_VFMA_ROWS LD_RHS_VFMA_ROWS_7
#elif M0 == 8 // M0 == 8
#define LD_RHS_VFMA_ROWS LD_RHS_VFMA_ROWS_8
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

/** Loads N0 RHS values and accumulates them into the M0 accumulators.
 *
 * The RHS vector is read from rhs_ptr + rhs_offset + 0x<i> * RHS_STEP_X * sizeof(DATA_TYPE); rhs_ptr, rhs_offset
 * and RHS_STEP_X are taken from the scope in which the macro is expanded.
 *
 * @param[in]      i Hexadecimal digit (0..F): both the RHS step index and the component of a<row> to use
 * @param[in]      a Basename of the LHS row vectors (a0, a1, ...)
 * @param[in, out] c Basename of the accumulators (c0, c1, ...)
 */
#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
    ({                                                                                                           \
        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
        LD_RHS_VFMA_ROWS(i, a, b, c)                                                                             \
    })

1388

1389

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
 *
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix (NOT reshaped). Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
                                           IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                           IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                           IMAGE_DECLARATION(dst),
                                           uint lhs_stride_z,
                                           uint rhs_stride_z,
#if defined(BETA)
                                           uint bias_stride_z,
#endif //defined(BETA)
                                           uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                           ,
                                           uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                           ,
                                           uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                          )
{
    // Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (N0)
#define RHS_STEP_X ((N0) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (N0)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the N x M output have nothing to compute
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

    // Compute RHS matrix address: (x % H0) selects the block within the reshaped row, (x / H0) selects the row
    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);   // 8 uint variables with basename zin (zin0, zin1, ...), all set to 0
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); // 16 uint variables with basename zero, all set to 0

#if defined(REINTERPRET_INPUT_AS_3D)

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;

    int i = 0;
    // Main loop: consume K0 values along K per iteration
    for(; i <= (K - K0); i += K0)
    {
        // Supported cases (M0, K0):
        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        // One RHS load + M0 fused multiply-adds per value along K0 (the index is a hexadecimal digit)
        LD_RHS_VFMA_M0xN0(0, a, c);
        LD_RHS_VFMA_M0xN0(1, a, c);
#if K0 > 2
        LD_RHS_VFMA_M0xN0(2, a, c);
#endif // K0 > 2
#if K0 > 3
        LD_RHS_VFMA_M0xN0(3, a, c);
#endif // K0 > 3
#if K0 > 4
        LD_RHS_VFMA_M0xN0(4, a, c);
        LD_RHS_VFMA_M0xN0(5, a, c);
        LD_RHS_VFMA_M0xN0(6, a, c);
        LD_RHS_VFMA_M0xN0(7, a, c);
#endif // K0 > 4
#if K0 > 8
        LD_RHS_VFMA_M0xN0(8, a, c);
        LD_RHS_VFMA_M0xN0(9, a, c);
        LD_RHS_VFMA_M0xN0(A, a, c);
        LD_RHS_VFMA_M0xN0(B, a, c);
        LD_RHS_VFMA_M0xN0(C, a, c);
        LD_RHS_VFMA_M0xN0(D, a, c);
        LD_RHS_VFMA_M0xN0(E, a, c);
        LD_RHS_VFMA_M0xN0(F, a, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
    }

    // Left-over accumulations
    for(; i < K; ++i)
    {
        // Load values from LHS matrix
        // A single scalar per row is loaded into a 2-element vector: LD_RHS_VFMA_M0xN0(0, ...) only reads component .s0
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif // M0 > 7

        LD_RHS_VFMA_M0xN0(0, a, c);

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

    // Release every helper macro defined at the top of this kernel so that none of them leaks into the code that follows
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
}

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1677

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1678

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1679

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1680

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

1681

/** Accumulates the product of one vector component: c = fma(a.s<lane>, b.s<lane>, c).
 *
 * @param[in]      a    LHS vector
 * @param[in]      b    RHS vector
 * @param[in, out] c    Scalar accumulator
 * @param[in]      lane Component suffix (0..9, A..F)
 */
#define ARM_DOT_K0_LANE(a, b, c, lane) c = fma(a.s##lane, b.s##lane, c)

// Cumulative component groups: ARM_DOT_K0_LANES_<n> accumulates components 0..n-1 in increasing order
#define ARM_DOT_K0_LANES_2(a, b, c) \
    ARM_DOT_K0_LANE(a, b, c, 0);    \
    ARM_DOT_K0_LANE(a, b, c, 1)
#define ARM_DOT_K0_LANES_3(a, b, c) \
    ARM_DOT_K0_LANES_2(a, b, c);    \
    ARM_DOT_K0_LANE(a, b, c, 2)
#define ARM_DOT_K0_LANES_4(a, b, c) \
    ARM_DOT_K0_LANES_3(a, b, c);    \
    ARM_DOT_K0_LANE(a, b, c, 3)
#define ARM_DOT_K0_LANES_8(a, b, c) \
    ARM_DOT_K0_LANES_4(a, b, c);    \
    ARM_DOT_K0_LANE(a, b, c, 4);    \
    ARM_DOT_K0_LANE(a, b, c, 5);    \
    ARM_DOT_K0_LANE(a, b, c, 6);    \
    ARM_DOT_K0_LANE(a, b, c, 7)
#define ARM_DOT_K0_LANES_16(a, b, c) \
    ARM_DOT_K0_LANES_8(a, b, c);     \
    ARM_DOT_K0_LANE(a, b, c, 8);     \
    ARM_DOT_K0_LANE(a, b, c, 9);     \
    ARM_DOT_K0_LANE(a, b, c, A);     \
    ARM_DOT_K0_LANE(a, b, c, B);     \
    ARM_DOT_K0_LANE(a, b, c, C);     \
    ARM_DOT_K0_LANE(a, b, c, D);     \
    ARM_DOT_K0_LANE(a, b, c, E);     \
    ARM_DOT_K0_LANE(a, b, c, F)

/** Dot product of two K0-element vectors accumulated into a scalar: c += a.s0 * b.s0 + ... + a.s(K0-1) * b.s(K0-1),
 *  evaluated as a chain of fma() calls from component 0 upwards.
 *
 * @param[in]      a LHS vector of K0 elements
 * @param[in]      b RHS vector of K0 elements
 * @param[in, out] c Scalar accumulator
 */
#if K0 == 2
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANES_2(a, b, c); \
    })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANES_3(a, b, c); \
    })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANES_4(a, b, c); \
    })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANES_8(a, b, c); \
    })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c)           \
    ({                                \
        ARM_DOT_K0_LANES_16(a, b, c); \
    })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions

1737

1738

/** Computes N0 dot products between one LHS vector and N0 RHS vectors: c.s<n> += dot(a, b<n>) for n = 0..N0-1,
 *  each one evaluated with ARM_DOT_K0.
 *
 * @param[in]      a LHS vector of K0 elements
 * @param[in]      b Basename of the RHS vectors (b0, b1, ...), K0 elements each
 * @param[in, out] c Accumulator vector of N0 elements
 */
// An earlier section of this file invokes ARM_DOT_K0XN0 with four arguments (k0, a, b, c). Drop any definition
// that is still live so that the three-parameter form below is never an incompatible macro redefinition.
#undef ARM_DOT_K0XN0
#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
        ARM_DOT_K0((a), (b##4), (c.s4)); \
        ARM_DOT_K0((a), (b##5), (c.s5)); \
        ARM_DOT_K0((a), (b##6), (c.s6)); \
        ARM_DOT_K0((a), (b##7), (c.s7)); \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
        ARM_DOT_K0((a), (b##4), (c.s4)); \
        ARM_DOT_K0((a), (b##5), (c.s5)); \
        ARM_DOT_K0((a), (b##6), (c.s6)); \
        ARM_DOT_K0((a), (b##7), (c.s7)); \
        ARM_DOT_K0((a), (b##8), (c.s8)); \
        ARM_DOT_K0((a), (b##9), (c.s9)); \
        ARM_DOT_K0((a), (b##A), (c.sA)); \
        ARM_DOT_K0((a), (b##B), (c.sB)); \
        ARM_DOT_K0((a), (b##C), (c.sC)); \
        ARM_DOT_K0((a), (b##D), (c.sD)); \
        ARM_DOT_K0((a), (b##E), (c.sE)); \
        ARM_DOT_K0((a), (b##F), (c.sF)); \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

1794

1795

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1796

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

1797

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

1798

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1799

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

1800

* @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 and -DN=90).

1801

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

1802

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

1803

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1804

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.

1805

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.

1806

* @note Only the following configurations of M0, N0 and K0 are currently supported:

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1807

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

1808

* - N0 = 2, 3, 4, 8, 16

1809

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1810

* - V0 >= 1

1811

* - H0 >= 1

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1812

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

1813

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1814

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

1815

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1816

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1817

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1818

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1819

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

1820

*

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1821

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

1822

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

1823

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1824

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

1825

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1826

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

1827

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1828

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1829

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1830

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1831

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1832

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

1833

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1834

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1835

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1836

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1837

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1838

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1839

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1840

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1841

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1842

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1843

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1844

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1845

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

1846

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

1847

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1848

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

1849

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1850

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1851

*/

1852

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

1853

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1854

#if defined(BETA)

1855

IMAGE_DECLARATION(bias),

1856

#endif // defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1857

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

1858

uint k,

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1859

uint lhs_stride_z,

1860

uint rhs_stride_z,

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1861

#if defined(BETA)

1862

uint bias_stride_z,

1863

#endif //defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1864

uint dst_stride_z

1865

#if defined(REINTERPRET_OUTPUT_AS_3D)

1866

,

1867

uint dst_cross_plane_pad

1868

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

1873

1874

#if defined(LHS_INTERLEAVE)

1875

#define LHS_OFFSET_X (K0)

1876

#define LHS_STEP_X ((K0) * (V0))

1877

#define LHS_STEP_LOOP (1)

1878

#else // defined(INTERLEAVE)

1879

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

1880

#define LHS_STEP_X (K0)

1881

#define LHS_STEP_LOOP (V0)

1882

#endif // defined(INTERLEAVE)

1883

1884

// Block size

1885

#define RHS_BLOCK_SIZE ((K0) * (N0))

1886

1887

// RHS offset and step X

1888

#if defined(RHS_INTERLEAVE)

1889

#define RHS_OFFSET_X (K0)

1890

#define RHS_STEP_X ((K0) * (H0))

1891

#define RHS_STEP_LOOP (1)

1892

#else // defined(RHS_INTERLEAVE)

1893

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1894

#define RHS_STEP_X (K0)

1895

#define RHS_STEP_LOOP (H0)

1896

#endif // defined(RHS_INTERLEAVE)

1897

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1898

#if defined(DUMMY_WORK_ITEMS)

1899

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1904

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1905

// Compute LHS matrix address

1906

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

1907

(get_global_id(2) * lhs_stride_z);

1908

1909

// Compute RHS matrix address

1910

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

1911

1912

#if defined(MATRIX_B_DEPTH)

1913

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1914

rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;

1915

#else // defined(MATRIX_B_DEPTH)

1916

rhs_addr += get_global_id(2) * rhs_stride_z;

1917

#endif // defined(MATRIX_B_DEPTH)

1918

1919

// Initialize the accumulators

Vidhya Sudhan Loganathan

2019-01-08 12:17:03 +0000

[diff] [blame]

1920

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1921

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1922

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

1923

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1924

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

1925

for(int i = 0; i < k; i += K0)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1926

{

1927

// Supported cases (M0, K0):

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1928

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1929

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1930

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1931

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1932

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1933

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1934

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1935

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1936

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1937

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1938

1939

// Load values from RHS matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1940

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1941

1942

// Accumulate

1943

ARM_DOT_K0XN0(a0, b, c0);

1944

#if M0 > 1

1945

ARM_DOT_K0XN0(a1, b, c1);

1946

#endif // M0 > 1

1947

#if M0 > 2

1948

ARM_DOT_K0XN0(a2, b, c2);

1949

#endif // M0 > 2

1950

#if M0 > 3

1951

ARM_DOT_K0XN0(a3, b, c3);

1952

#endif // M0 > 3

1953

#if M0 > 4

1954

ARM_DOT_K0XN0(a4, b, c4);

1955

#endif // M0 > 4

1956

#if M0 > 5

1957

ARM_DOT_K0XN0(a5, b, c5);

1958

#endif // M0 > 5

1959

#if M0 > 6

1960

ARM_DOT_K0XN0(a6, b, c6);

1961

#endif // M0 > 6

1962

#if M0 > 7

1963

ARM_DOT_K0XN0(a7, b, c7);

1964

#endif // M0 > 7

1965

1966

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

1967

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

1968

}

1969

1970

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

1971

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1972

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1973

1974

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1975

1976

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1977

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1978

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1979

// multiply dst_stride_z by DEPTH_GEMM3D

1980

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

1981

1982

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1983

1984

// Add offset for batched GEMM

1985

dst_addr += get_global_id(2) * dst_stride_z;

1986

1987

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1988

1989

// Multiply by the weight of matrix-matrix product and store the result

1990

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1991

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1992

#endif // defined(ALPHA)

1993

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1994

// Add beta*bias

1995

#if defined(BETA)

1996

#if defined(BROADCAST_BIAS)

1997

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1998

1999

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2000

2001

#ifndef UNIT_BETA

2002

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2003

#endif // UNIT_BIAS

2004

2005

// c = c + bias[broadcasted]

2006

ADD_BLOCK_BROADCAST(M0, c, bias0);

2007

2008

#else // defined(BROADCAST_BIAS)

2009

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2010

2) * bias_stride_z;

2011

2012

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2013

2014

#ifndef UNIT_BETA

2015

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

2020

2021

#endif // defined(BROADCAST_BIAS)

2022

#endif // defined(BETA)

2023

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2024

#if defined(ACTIVATION_TYPE)

2025

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2026

#endif // defined(ACTIVATION_TYPE)

2027

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2028

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2029

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2030

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2031

#undef LHS_BLOCK_SIZE

2032

#undef LHS_OFFSET_X

2033

#undef LHS_STEP_X

2034

#undef RHS_BLOCK_SIZE

2035

#undef RHS_OFFSET_X

2036

#undef RHS_STEP_X

2037

}

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2038

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2039

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)

2040

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2041

#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

2042

2043

#define VFMA(a, b, c) \

({ \

c = fma(a, b, c); \

})

/** Accumulate one step of the K dimension into the M0 accumulator rows.
 *
 * For every row r in [0, M0): component @p i of the LHS vector a##r is converted to a
 * VEC_DATA_TYPE(DATA_TYPE, N0) vector and fused-multiply-added with @p b into c##r.
 * One variant is provided per supported M0 (1..8); any other value stops the build.
 *
 * @param[in]      i Component suffix pasted after ".s" to select the element of each LHS row vector
 * @param[in]      a Basename of the LHS row vectors (a##0 ... a##(M0-1))
 * @param[in]      b RHS vector
 * @param[in, out] c Basename of the accumulator vectors (c##0 ... c##(M0-1))
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 conditions

2119

2120

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2121

* The LHS matrix is NOT reshaped

2122

* The RHS matrix is NOT reshaped

2123

*

2124

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2125

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (e.g. -DM=52, -DN=30 and -DK=90)

2126

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

2127

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

2128

* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)

2129

* @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2130

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2131

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

2132

* - N0 = 2, 3, 4, 8, 16

2133

* - K0 = 2, 3, 4, 8, 16

2134

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2135

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2136

* The activation function is performed after the bias addition

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2137

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2138

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

2139

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2140

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2141

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2142

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

2143

*

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2144

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

2145

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

2146

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2147

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

2148

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2149

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

2150

* @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr

2151

* @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)

2152

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

2153

* @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)

2154

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

2155

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2156

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2157

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2158

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2159

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2160

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2161

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2162

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2163

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2164

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2165

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2166

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2167

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2168

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

2169

* @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)

2170

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2171

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2172

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

2173

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2174

*/

2175

__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),

2176

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2177

#if defined(BETA)

2178

IMAGE_DECLARATION(bias),

2179

#endif // defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2180

IMAGE_DECLARATION(dst),

2181

uint lhs_stride_z,

2182

uint rhs_stride_z,

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2183

#if defined(BETA)

2184

uint bias_stride_z,

2185

#endif //defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2186

uint dst_stride_z

2187

#if defined(REINTERPRET_INPUT_AS_3D)

2188

,

2189

uint lhs_cross_plane_pad

2190

#endif // REINTERPRET_INPUT_AS_3D

2191

#if defined(REINTERPRET_OUTPUT_AS_3D)

2192

,

2193

uint dst_cross_plane_pad

2194

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

2199

2200

// RHS offset and step X

2201

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2202

2203

uint x = get_global_id(0);

2204

uint y = get_global_id(1);

2205

uint z = get_global_id(2);

2206

2207

#if defined(DUMMY_WORK_ITEMS)

2208

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2213

2214

// Compute LHS matrix address

2215

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

2216

2217

// Compute RHS matrix address

2218

uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

2219

2220

#if defined(MATRIX_B_DEPTH)

2221

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2222

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

2223

#else // defined(MATRIX_B_DEPTH)

2224

rhs_offset += z * rhs_stride_z;

2225

#endif // defined(MATRIX_B_DEPTH)

2226

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2227

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

2228

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2229

2230

#if defined(REINTERPRET_INPUT_AS_3D)

2231

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2232

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

2233

2234

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2235

// multiply lhs_stride_z by DEPTH_GEMM3D

2236

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

2237

2238

#else // defined(REINTERPRET_INPUT_AS_3D)

2239

2240

// Add offset for batched GEMM

2241

lhs_offset += z * lhs_stride_z;

2242

2243

#endif // defined(REINTERPRET_INPUT_AS_3D)

2244

2245

// Initialize the accumulators

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2246

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2247

2248

int i = 0;

2249

for(; i <= (K - K0); i += K0)

2250

{

2251

// Supported cases (M0, K0):

2252

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

2253

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

2254

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

2255

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

2256

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

2257

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

2258

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

2259

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

2260

// Load values from LHS matrix

2261

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

2262

2263

// Load values from RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2264

LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2265

2266

RHS_VFMA_M0xN0(0, a, b0, c);

2267

RHS_VFMA_M0xN0(1, a, b1, c);

2268

#if K0 > 2

2269

RHS_VFMA_M0xN0(2, a, b2, c);

2270

#endif // K0 > 2

2271

#if K0 > 3

2272

RHS_VFMA_M0xN0(3, a, b3, c);

2273

#endif // K0 > 3

2274

#if K0 > 4

2275

RHS_VFMA_M0xN0(4, a, b4, c);

2276

RHS_VFMA_M0xN0(5, a, b5, c);

2277

RHS_VFMA_M0xN0(6, a, b6, c);

2278

RHS_VFMA_M0xN0(7, a, b7, c);

2279

#endif // K0 > 4

2280

#if K0 > 8

2281

RHS_VFMA_M0xN0(8, a, b8, c);

2282

RHS_VFMA_M0xN0(9, a, b9, c);

2283

RHS_VFMA_M0xN0(A, a, b10, c);

2284

RHS_VFMA_M0xN0(B, a, b11, c);

2285

RHS_VFMA_M0xN0(C, a, b12, c);

2286

RHS_VFMA_M0xN0(D, a, b13, c);

2287

RHS_VFMA_M0xN0(E, a, b14, c);

2288

RHS_VFMA_M0xN0(F, a, b15, c);

2289

#endif // K0 > 8

2290

2291

lhs_offset += K0 * sizeof(DATA_TYPE);

2292

rhs_offset += K0 * rhs_stride_y;

2293

}

2294

2295

// Left-over accumulations

2296

for(; i < K; ++i)

2297

{

2298

// Load values from LHS matrix

2299

VEC_DATA_TYPE(DATA_TYPE, 2)

2300

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));

2301

#if M0 > 1

2302

VEC_DATA_TYPE(DATA_TYPE, 2)

2303

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));

2304

#endif // M0 > 1

2305

#if M0 > 2

2306

VEC_DATA_TYPE(DATA_TYPE, 2)

2307

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));

2308

#endif // M0 > 2

2309

#if M0 > 3

2310

VEC_DATA_TYPE(DATA_TYPE, 2)

2311

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));

2312

#endif // M0 > 3

2313

#if M0 > 4

2314

VEC_DATA_TYPE(DATA_TYPE, 2)

2315

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));

2316

#endif // M0 > 4

2317

#if M0 > 5

2318

VEC_DATA_TYPE(DATA_TYPE, 2)

2319

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));

2320

#endif // M0 > 5

2321

#if M0 > 6

2322

VEC_DATA_TYPE(DATA_TYPE, 2)

2323

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));

2324

#endif // M0 > 6

2325

#if M0 > 7

2326

VEC_DATA_TYPE(DATA_TYPE, 2)

2327

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));

2328

#endif // M0 > 7

2329

2330

VEC_DATA_TYPE(DATA_TYPE, N0)

2331

b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));

2332

RHS_VFMA_M0xN0(0, a, b, c);

2333

2334

lhs_offset += sizeof(DATA_TYPE);

2335

rhs_offset += rhs_stride_y;

2336

}

2337

2338

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

2339

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2340

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2341

2342

#if defined(REINTERPRET_OUTPUT_AS_3D)

2343

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2344

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2345

2346

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2347

// multiply dst_stride_z by DEPTH_GEMM3D

2348

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2349

2350

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2351

2352

// Add offset for batched GEMM

2353

dst_addr += z * dst_stride_z;

2354

2355

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2356

2357

// Multiply by the weight of matrix-matrix product and store the result

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2358

#if defined(ALPHA)

2359

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2360

#endif // defined(ALPHA)

2361

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

2362

// Add beta*bias

2363

#if defined(BETA)

2364

#if defined(BROADCAST_BIAS)

2365

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2366

2367

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2368

2369

#ifndef UNIT_BETA

2370

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2371

#endif // UNIT_BETA

2372

2373

// c = c + bias[broadcasted]

2374

ADD_BLOCK_BROADCAST(M0, c, bias0);

2375

2376

#else // defined(BROADCAST_BIAS)

2377

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2378

2) * bias_stride_z;

2379

2380

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2381

2382

#ifndef UNIT_BETA

2383

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

2388

2389

#endif // defined(BROADCAST_BIAS)

2390

#endif // defined(BETA)

2391

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2392

#if defined(ACTIVATION_TYPE)

2393

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2394

#endif // defined(ACTIVATION_TYPE)

2395

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2396

// Store output block

2397

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

2398

2399

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

2404

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2405

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2406

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

2407

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2408

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2409

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

2410

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

2411

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

2412

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2413

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2414

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

2415

* The activation function is performed after the bias addition

2416

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2417

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2418

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2419

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2420

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

2421

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2422

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

2423

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2424

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

2425

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2426

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

2427

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2428

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2429

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2430

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

2431

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2432

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

2433

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2434

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

2435

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2436

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

2437

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2438

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

2439

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2440

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2441

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2442

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2443

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2444

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2445

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2446

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

2447

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2448

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2449

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2450

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2451

*/

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

2452

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),

2453

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2454

#if defined(BETA)

2455

IMAGE_DECLARATION(src2),

2456

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

2457

IMAGE_DECLARATION(dst),

2458

uint src0_stride_z,

2459

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2460

#if defined(BETA)

2461

uint src2_stride_z,

2462

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2463

uint dst_stride_z

2464

#if defined(REINTERPRET_OUTPUT_AS_3D)

2465

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2466

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2467

#endif // REINTERPRET_OUTPUT_AS_3D

2468

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2469

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2470

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

2471

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2472

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2473

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2474

// Offset

2475

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

2476

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2477

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2478

// src_addr_a = address of matrix A

2479

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2480

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

2481

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

2482

2483

#if defined(MATRIX_B_DEPTH)

2484

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2485

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

2486

#else // defined(MATRIX_B_DEPTH)

2487

src1_addr_in_bytes += z * src1_stride_z;

2488

#endif // defined(MATRIX_B_DEPTH)

2489

2490

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

2491

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2492

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2493

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2494

__global float *src_end_addr_b = src_addr_b + COLS_B;

2495

2496

src_addr_a += offset_row_a;

2497

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2498

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2499

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2504

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2505

for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2506

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2507

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2508

float4 a0 = vload4(0, src_addr_a);

2509

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2510

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2511

c0 += (float4)a0.s0 * b0;

2512

c1 += (float4)a0.s1 * b0;

2513

c2 += (float4)a0.s2 * b0;

2514

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2515

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2516

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2517

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

2518

b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2519

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2520

c0 += (float4)a0.s0 * b0;

2521

c1 += (float4)a0.s1 * b0;

2522

c2 += (float4)a0.s2 * b0;

2523

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2524

}

2525

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2526

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2527

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2528

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2529

float4 a0 = vload4(0, src_addr_a);

2530

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2531

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2532

c0 += (float4)a0.s0 * b0;

2533

c1 += (float4)a0.s1 * b0;

2534

c2 += (float4)a0.s2 * b0;

2535

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2536

}

2537

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2538

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2539

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2540

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2541

// Compute dst address

2542

__global uchar *dst_addr = offset(&dst, 0, 0);

2543

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2544

uint4 zout = 0;

2545

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2546

#if defined(REINTERPRET_OUTPUT_AS_3D)

2547

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2548

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2549

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

2554

// |******************|

2555

// | cross_plane_pad |

2556

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2561

2562

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2563

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

2564

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2565

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2566

// Add offset due to the cross plane paddings

2567

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2568

2569

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2570

// multiply dst_stride_z by DEPTH_GEMM3D

2571

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2572

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2573

// Add offset for batched GEMM

2574

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2575

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2576

2577

// Multiply by the weight of matrix-matrix product and store the result

2578

#if defined(ALPHA)

2579

SCALE_BLOCK(4, float, c, ALPHA);

2580

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

2585

2586

#if defined(BROADCAST_BIAS)

2587

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

2588

2589

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

2590

2591

#ifndef UNIT_BETA

2592

SCALE_BLOCK(1, float, bias, BETA);

2593

#endif // UNIT_BIAS

2594

2595

// c = c + bias[broadcasted]

2596

ADD_BLOCK_BROADCAST(4, c, bias0);

2597

2598

#else // defined(BROADCAST_BIAS)

2599

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

2600

2) * src2_stride_z;

2601

2602

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

2603

2604

#ifndef UNIT_BETA

2605

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias);

2610

2611

#endif // defined(BROADCAST_BIAS)

2612

#endif // defined(BETA)

2613

2614

#if defined(ACTIVATION_TYPE)

2615

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

2616

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2617

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2618

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2619

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

2620

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

2621

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

2622

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2623

}

2624

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2625

/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2626

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2627

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2628

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

2629

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

2630

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

2631

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

2632

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2633

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2634

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

2635

* The activation function is performed after the bias addition

2636

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2637

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2638

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2639

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2640

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

2641

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2642

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

2643

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2644

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

2645

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2646

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

2647

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2648

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2649

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2650

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

2651

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2652

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

2653

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2654

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

2655

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2656

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

2657

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2658

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

2659

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2660

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2661

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2662

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2663

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2664

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2665

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2666

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

2667

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2668

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2669

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

2670

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2671

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

2672

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

2673

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2674

#if defined(BETA)

2675

IMAGE_DECLARATION(src2),

2676

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2677

IMAGE_DECLARATION(dst),

2678

uint src0_stride_z,

2679

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2680

#if defined(BETA)

2681

uint src2_stride_z,

2682

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2683

uint dst_stride_z

2684

#if defined(REINTERPRET_OUTPUT_AS_3D)

2685

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2686

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2687

#endif // REINTERPRET_OUTPUT_AS_3D

2688

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2689

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2690

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

2691

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2692

int z = get_global_id(2);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2693

2694

// Offset

2695

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

2696

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

2697

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2698

// src_addr_a = address of matrix A

2699

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

2700

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

2701

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

2702

2703

#if defined(MATRIX_B_DEPTH)

2704

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2705

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

2706

#else // defined(MATRIX_B_DEPTH)

2707

src1_addr_in_bytes += z * src1_stride_z;

2708

#endif // defined(MATRIX_B_DEPTH)

2709

2710

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

2711

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2712

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2713

src_addr_a += offset_row_a;

2714

src_addr_b += offset_row_b;

2715

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2716

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2721

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2722

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

2723

2724

int i = 0;

2725

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2726

{

2727

// Load values from matrix A (interleaved) and matrix B (transposed)

2728

float4 a0 = vload4(0, src_addr_a);

2729

float4 b0 = vload4(0, src_addr_b);

2730

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2731

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

2732

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2733

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2734

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

2735

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

2736

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

2737

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2738

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2739

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

2740

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

2741

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

2742

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2743

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2744

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

2745

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

2746

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

2747

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2748

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2749

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

2750

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

2751

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

2752

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2753

2754

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2755

a0 = vload4(0, src_addr_a);

2756

b0 = vload4(0, src_addr_b);

2757

2758

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

2759

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2760

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2761

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

2762

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

2763

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

2764

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2765

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2766

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

2767

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

2768

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

2769

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2770

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2771

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

2772

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

2773

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

2774

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2775

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2776

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

2777

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

2778

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

2779

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2780

2781

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2782

a0 = vload4(0, src_addr_a);

2783

b0 = vload4(0, src_addr_b);

2784

2785

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

2786

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

2787

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2788

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

2789

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

2790

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

2791

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2792

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2793

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

2794

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

2795

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

2796

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2797

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2798

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

2799

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

2800

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

2801

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2802

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2803

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

2804

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

2805

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

2806

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2807

2808

// Load values from matrix A (interleaved) and matrix B (transposed)

2809

a0 = vload4(0, src_addr_a);

2810

b0 = vload4(0, src_addr_b);

2811

2812

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

2813

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2814

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2815

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

2816

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

2817

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

2818

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2819

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2820

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

2821

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

2822

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

2823

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2824

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2825

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

2826

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

2827

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

2828

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2829

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2830

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

2831

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

2832

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

2833

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2834

}

2835

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2836

for(; i < (int)(COLS_MTX_B); ++i)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2837

{

2838

// Load values from matrix A (interleaved) and matrix B (transposed)

2839

float4 a0 = vload4(0, src_addr_a);

2840

float4 b0 = vload4(0, src_addr_b);

2841

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

2842

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

2843

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

2844

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2845

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

2846

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

2847

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

2848

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2849

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2850

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

2851

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

2852

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

2853

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2854

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2855

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

2856

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

2857

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

2858

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2859

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2860

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

2861

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

2862

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

2863

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2864

}

2865

2866

// Compute destination address

2867

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2868

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2869

// Compute dst address

2870

__global uchar *dst_addr = offset(&dst, 0, 0);

2871

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2872

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

2873

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2874

#if defined(REINTERPRET_OUTPUT_AS_3D)

2875

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2876

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2877

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// |                  |

// |      plane0      |

// |                  |

// |__________________|

2882

// |******************|

2883

// | cross_plane_pad |

2884

// |******************|

// |                  |

// |      plane1      |

// |                  |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2889

2890

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2891

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

2892

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2893

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

2894

// Add offset due to the cross plane paddings

2895

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2896

2897

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2898

// multiply dst_stride_z by DEPTH_GEMM3D

2899

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2900

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2901

// Add offset for batched GEMM

2902

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2903

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2904

2905

// Multiply by the weight of matrix-matrix product and store the result

2906

#if defined(ALPHA)

2907

SCALE_BLOCK(4, float, c, ALPHA);

2908

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

2913

2914

#if defined(BROADCAST_BIAS)

2915

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

2916

2917

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

2918

2919

#ifndef UNIT_BETA

2920

SCALE_BLOCK(1, float, bias, BETA);

2921

#endif // UNIT_BETA

2922

2923

// c = c + bias[broadcasted]

2924

ADD_BLOCK_BROADCAST(4, c, bias0);

2925

2926

#else // defined(BROADCAST_BIAS)

2927

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

2928

2) * src2_stride_z;

2929

2930

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

2931

2932

#ifndef UNIT_BETA

2933

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

2938

2939

#endif // defined(BROADCAST_BIAS)

2940

#endif // defined(BETA)

2941

2942

#if defined(ACTIVATION_TYPE)

2943

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

2944

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

2945

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2946

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2947

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

2948

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

2949

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

2950

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2951

}

2952

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

2953

// Undefine local defines

2954

#undef COLS_MTX_B

2955

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

2956

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2957

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

2958

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

2959

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2960

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

2961

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

2962

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

2963

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2964

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2965

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

2966

* The activation function is performed after the bias addition

2967

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2968

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2969

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2970

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2971

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows (M) of matrix A NOT reshaped

2972

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2973

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

2974

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2975

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

2976

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2977

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

2978

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2979

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2980

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2981

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

2982

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2983

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

2984

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2985

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

2986

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2987

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

2988

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2989

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

2990

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2991

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2992

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2993

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2994

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

2995

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2996

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

2997

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

2998

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

2999

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3000

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

3001

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3002

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

3003

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

3004

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3005

#if defined(BETA)

3006

IMAGE_DECLARATION(src2),

3007

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3008

IMAGE_DECLARATION(dst),

3009

uint src0_stride_z,

3010

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3011

#if defined(BETA)

3012

uint src2_stride_z,

3013

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3014

uint dst_stride_z

3015

#if defined(REINTERPRET_OUTPUT_AS_3D)

3016

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3017

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3018

#endif // REINTERPRET_OUTPUT_AS_3D

3019

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3020

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3021

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

3022

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3023

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3024

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3025

// Offset

3026

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

3027

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3028

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3029

// src_addr_a = address of matrix A

3030

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

3031

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

3032

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

3033

3034

#if defined(MATRIX_B_DEPTH)

3035

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3036

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

3037

#else // defined(MATRIX_B_DEPTH)

3038

src1_addr_in_bytes += z * src1_stride_z;

3039

#endif // defined(MATRIX_B_DEPTH)

3040

3041

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

3042

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3043

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3044

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3045

__global half *src_end_addr_b = src_addr_b + COLS_B;

3046

3047

src_addr_a += offset_row_a;

3048

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3049

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3050

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3055

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3056

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3057

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3058

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3059

half4 a0 = vload4(0, src_addr_a);

3060

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3061

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3062

c0 += (half8)a0.s0 * b0;

3063

c1 += (half8)a0.s1 * b0;

3064

c2 += (half8)a0.s2 * b0;

3065

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3066

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3067

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3068

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

3069

b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3070

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3071

c0 += (half8)a0.s0 * b0;

3072

c1 += (half8)a0.s1 * b0;

3073

c2 += (half8)a0.s2 * b0;

3074

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3075

}

3076

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3077

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3078

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3079

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3080

half4 a0 = vload4(0, src_addr_a);

3081

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3082

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3083

c0 += (half8)a0.s0 * b0;

3084

c1 += (half8)a0.s1 * b0;

3085

c2 += (half8)a0.s2 * b0;

3086

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3087

}

3088

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3089

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3090

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

3091

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3092

// Compute dst address

3093

__global uchar *dst_addr = offset(&dst, 0, 0);

3094

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3095

uint4 zout = 0;

3096

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3097

#if defined(REINTERPRET_OUTPUT_AS_3D)

3098

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3099

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3100

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// |                  |

// |      plane0      |

// |                  |

// |__________________|

3105

// |******************|

3106

// | cross_plane_pad |

3107

// |******************|

// |                  |

// |      plane1      |

// |                  |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3112

3113

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3114

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

3115

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3116

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3117

// Add offset due to the cross plane paddings

3118

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3119

3120

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3121

// multiply dst_stride_z by DEPTH_GEMM3D

3122

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3123

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3124

// Add offset for batched GEMM

3125

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3126

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3127

3128

// Multiply by the weight of matrix-matrix product and store the result

3129

#if defined(ALPHA)

3130

SCALE_BLOCK(4, half, c, ALPHA);

3131

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

3136

3137

#if defined(BROADCAST_BIAS)

3138

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

3139

3140

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

3141

3142

#ifndef UNIT_BETA

3143

SCALE_BLOCK(1, half, bias, BETA);

3144

#endif // UNIT_BETA

3145

3146

// c = c + bias[broadcasted]

3147

ADD_BLOCK_BROADCAST(4, c, bias0);

3148

3149

#else // defined(BROADCAST_BIAS)

3150

3151

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

3152

2) * src2_stride_z;

3153

3154

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

3155

3156

#ifndef UNIT_BETA

3157

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

3162

3163

#endif // defined(BROADCAST_BIAS)

3164

#endif // defined(BETA)

3165

3166

#if defined(ACTIVATION_TYPE)

3167

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

3168

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

3169

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3170

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3171

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

3172

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

3173

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

3174

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3175

}

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3176

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3177

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable.

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

3178

*

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3179

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3180

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

3181

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

3182

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

3183

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3184

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3185

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3186

* The activation function is performed after the bias addition

3187

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3188

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3189

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3190

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3191

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows (M) of matrix A NOT reshaped

3192

*

3193

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

3194

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

3195

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

3196

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

3197

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

3198

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

3199

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

3200

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

3201

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

3202

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

3203

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

3204

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3205

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

3206

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3207

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

3208

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3209

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

3210

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3211

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

3212

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3213

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3214

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3215

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3216

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

3217

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

3218

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3219

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3220

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3221

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3222

*/

3223

__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),

3224

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3225

#if defined(BETA)

3226

IMAGE_DECLARATION(src2),

3227

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3228

IMAGE_DECLARATION(dst),

3229

uint src0_stride_z,

3230

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3231

#if defined(BETA)

3232

uint src2_stride_z,

3233

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3234

uint dst_stride_z

3235

#if defined(REINTERPRET_OUTPUT_AS_3D)

3236

,

3237

uint cross_plane_pad

3238

#endif // REINTERPRET_OUTPUT_AS_3D

3239

)

3240

{

3241

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

3242

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

3243

int z = get_global_id(2);

3244

3245

// Offset

3246

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

3247

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

3248

3249

// src_addr_a = address of matrix A

3250

// src_addr_b = address of matrix B

3251

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

3252

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

3253

3254

#if defined(MATRIX_B_DEPTH)

3255

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3256

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

3257

#else // defined(MATRIX_B_DEPTH)

3258

src1_addr_in_bytes += z * src1_stride_z;

3259

#endif // defined(MATRIX_B_DEPTH)

3260

3261

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

3262

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

3263

3264

// Compute end row address for matrix B

3265

__global half *src_end_addr_b = src_addr_b + COLS_B;

3266

3267

src_addr_a += offset_row_a;

3268

src_addr_b += offset_row_b;

3269

3270

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

float8 c0 = 0.0f;

float8 c1 = 0.0f;

float8 c2 = 0.0f;

float8 c3 = 0.0f;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3275

3276

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

3277

{

3278

// Load values from matrix A (interleaved) and matrix B (transposed)

3279

float4 a0 = convert_float4(vload4(0, src_addr_a));

3280

float8 b0 = convert_float8(vload8(0, src_addr_b));

3281

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3282

c0 += (float8)a0.s0 * b0;

3283

c1 += (float8)a0.s1 * b0;

3284

c2 += (float8)a0.s2 * b0;

3285

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3286

3287

// Load values from matrix A (interleaved) and matrix B (transposed)

3288

a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));

3289

b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));

3290

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3291

c0 += (float8)a0.s0 * b0;

3292

c1 += (float8)a0.s1 * b0;

3293

c2 += (float8)a0.s2 * b0;

3294

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3295

}

3296

3297

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

3298

{

3299

// Load values from matrix A (interleaved) and matrix B (transposed)

3300

float4 a0 = convert_float4(vload4(0, src_addr_a));

3301

float8 b0 = convert_float8(vload8(0, src_addr_b));

3302

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3303

c0 += (float8)a0.s0 * b0;

3304

c1 += (float8)a0.s1 * b0;

3305

c2 += (float8)a0.s2 * b0;

3306

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3307

}

3308

3309

// Compute destination address

3310

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

3311

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3312

// Compute dst address

3313

__global uchar *dst_addr = offset(&dst, 0, 0);

3314

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3315

uint4 zout = 0;

3316

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3317

#if defined(REINTERPRET_OUTPUT_AS_3D)

3318

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

3319

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

3325

// |******************|

3326

// | cross_plane_pad |

3327

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

3332

3333

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3334

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

3335

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3336

3337

// Add offset due to the cross plane paddings

3338

zout *= (cross_plane_pad * dst_stride_y);

3339

3340

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3341

// multiply dst_stride_z by DEPTH_GEMM3D

3342

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3343

#else // defined(REINTERPRET_OUTPUT_AS_3D)

3344

// Add offset for batched GEMM

3345

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3346

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3347

3348

// Multiply by the weight of matrix-matrix product and store the result

3349

#if defined(ALPHA)

3350

SCALE_BLOCK(4, float, c, ALPHA);

3351

#endif // defined(ALPHA)

3352

3353

#if defined(BETA)

3354

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

3355

3356

#if defined(BROADCAST_BIAS)

3357

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

3358

3359

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

3360

3361

float8 bias_f0 = convert_float8(bias0);

3362

3363

#ifndef UNIT_BETA

3364

SCALE_BLOCK(1, float, bias_f, BETA);

3365

#endif // UNIT_BIAS

3366

3367

// c = c + bias[broadcasted]

3368

ADD_BLOCK_BROADCAST(4, c, bias_f0);

3369

3370

#else // defined(BROADCAST_BIAS)

3371

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

3372

2) * src2_stride_z;

3373

3374

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

3375

3376

float8 bias_f0 = convert_float8(bias0);

3377

float8 bias_f1 = convert_float8(bias1);

3378

float8 bias_f2 = convert_float8(bias2);

3379

float8 bias_f3 = convert_float8(bias3);

3380

3381

#ifndef UNIT_BETA

3382

SCALE_BLOCK(4, float, bias_f, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias_f);

3387

3388

#endif // defined(BROADCAST_BIAS)

3389

#endif // defined(BETA)

3390

3391

half8 c_h0 = convert_half8(c0);

3392

half8 c_h1 = convert_half8(c1);

3393

half8 c_h2 = convert_half8(c2);

3394

half8 c_h3 = convert_half8(c3);

3395

3396

#if defined(ACTIVATION_TYPE)

3397

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);

3398

#endif // defined(ACTIVATION_TYPE)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3399

3400

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3401

vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

3402

vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

3403

vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

3404

vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

3405

}

3406

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3407

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

3408

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3409

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3410

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

3411

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

3412

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

3413

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3414

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3415

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

3416

* The activation function is performed after the bias addition

3417

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3418

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3419

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3420

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3421

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

3422

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3423

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

3424

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

3425

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

3426

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

3427

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

3428

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

3429

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

3430

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

3431

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

3432

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

3433

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

3434

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3435

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

3436

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3437

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

3438

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3439

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

3440

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3441

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

3442

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3443

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3444

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3445

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3446

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3447

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

3448

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

3449

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

3450

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3451

*/

3452

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),

3453

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3454

#if defined(BETA)

3455

IMAGE_DECLARATION(src2),

3456

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3457

IMAGE_DECLARATION(dst),

3458

uint src0_stride_z,

3459

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3460

#if defined(BETA)

3461

uint src2_stride_z,

3462

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3463

uint dst_stride_z

3464

#if defined(REINTERPRET_OUTPUT_AS_3D)

3465

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3466

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3467

#endif // REINTERPRET_OUTPUT_AS_3D

3468

)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3469

{

3470

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

3471

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

3472

int z = get_global_id(2);

3473

3474

// Offset

3475

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

3476

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

3477

3478

// src_addr_a = address of matrix A

3479

// src_addr_b = address of matrix B

3480

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

3481

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

3482

3483

#if defined(MATRIX_B_DEPTH)

3484

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3485

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

3486

#else // defined(MATRIX_B_DEPTH)

3487

src1_addr_in_bytes += z * src1_stride_z;

3488

#endif // defined(MATRIX_B_DEPTH)

3489

3490

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

3491

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

3492

3493

// Compute end row address for matrix B

3494

__global half *src_end_addr_b = src_addr_b + COLS_B;

3495

3496

src_addr_a += offset_row_a;

3497

src_addr_b += offset_row_b;

3498

3499

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3504

3505

#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

3506

3507

int i = 0;

3508

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

3509

{

3510

#if MULT_INTERLEAVE4X4_HEIGHT == 1

3511

// Load values from matrix A (interleaved) and matrix B (transposed)

3512

half8 a0 = vload8(0, src_addr_a);

3513

half8 b0 = vload8(0, src_addr_b);

3514

3515

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

3516

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

3517

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3518

c0 = fma((half8)a0.s0, b0, c0);

3519

c1 = fma((half8)a0.s1, b0, c1);

3520

c2 = fma((half8)a0.s2, b0, c2);

3521

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3522

3523

// Load values from matrix B (transposed)

3524

b0 = vload8(0, src_addr_b);

3525

3526

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

3527

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3528

c0 = fma((half8)a0.s4, b0, c0);

3529

c1 = fma((half8)a0.s5, b0, c1);

3530

c2 = fma((half8)a0.s6, b0, c2);

3531

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3532

3533

// Load values from matrix A (interleaved) and matrix B (transposed)

3534

a0 = vload8(0, src_addr_a);

3535

b0 = vload8(0, src_addr_b);

3536

3537

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

3538

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

3539

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3540

c0 = fma((half8)a0.s0, b0, c0);

3541

c1 = fma((half8)a0.s1, b0, c1);

3542

c2 = fma((half8)a0.s2, b0, c2);

3543

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3544

3545

// Load values from matrix B (transposed)

3546

b0 = vload8(0, src_addr_b);

3547

3548

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

3549

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3550

c0 = fma((half8)a0.s4, b0, c0);

3551

c1 = fma((half8)a0.s5, b0, c1);

3552

c2 = fma((half8)a0.s6, b0, c2);

3553

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3554

#else // MULT_INTERLEAVE4X4_HEIGHT == 1

3555

// Load values from matrix A (interleaved) and matrix B (transposed)

3556

half4 a0 = vload4(0, src_addr_a);

3557

half8 b0 = vload8(0, src_addr_b);

3558

3559

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

3560

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

3561

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3562

c0 = fma((half8)a0.s0, b0, c0);

3563

c1 = fma((half8)a0.s1, b0, c1);

3564

c2 = fma((half8)a0.s2, b0, c2);

3565

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3566

3567

// Load values from matrix A (interleaved) and matrix B (transposed)

3568

a0 = vload4(0, src_addr_a);

3569

b0 = vload8(0, src_addr_b);

3570

3571

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

3572

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

3573

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3574

c0 = fma((half8)a0.s0, b0, c0);

3575

c1 = fma((half8)a0.s1, b0, c1);

3576

c2 = fma((half8)a0.s2, b0, c2);

3577

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3578

3579

// Load values from matrix A (interleaved) and matrix B (transposed)

3580

a0 = vload4(0, src_addr_a);

3581

b0 = vload8(0, src_addr_b);

3582

3583

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

3584

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

3585

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3586

c0 = fma((half8)a0.s0, b0, c0);

3587

c1 = fma((half8)a0.s1, b0, c1);

3588

c2 = fma((half8)a0.s2, b0, c2);

3589

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3590

3591

// Load values from matrix A (interleaved) and matrix B (transposed)

3592

a0 = vload4(0, src_addr_a);

3593

b0 = vload8(0, src_addr_b);

3594

3595

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

3596

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

3597

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3598

c0 = fma((half8)a0.s0, b0, c0);

3599

c1 = fma((half8)a0.s1, b0, c1);

3600

c2 = fma((half8)a0.s2, b0, c2);

3601

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3602

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

3603

}

3604

3605

for(; i < (int)(COLS_MTX_B); ++i)

3606

{

3607

// Load values from matrix A (interleaved) and matrix B (transposed)

3608

half4 a0 = vload4(0, src_addr_a);

3609

half8 b0 = vload8(0, src_addr_b);

3610

3611

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

3612

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

3613

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3614

c0 = fma((half8)a0.s0, b0, c0);

3615

c1 = fma((half8)a0.s1, b0, c1);

3616

c2 = fma((half8)a0.s2, b0, c2);

3617

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3618

}

3619

3620

// Compute destination address

3621

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

3622

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3623

// Compute dst address

3624

__global uchar *dst_addr = offset(&dst, 0, 0);

3625

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3626

uint4 zout = 0;

3627

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3628

#if defined(REINTERPRET_OUTPUT_AS_3D)

3629

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3630

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3631

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

3636

// |******************|

3637

// | cross_plane_pad |

3638

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3643

3644

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3645

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

3646

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3647

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

3648

// Add offset due to the cross plane paddings

3649

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3650

3651

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3652

// multiply dst_stride_z by DEPTH_GEMM3D

3653

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3654

#else // defined(REINTERPRET_OUTPUT_AS_3D)

3655

// Add offset for batched GEMM

3656

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3657

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3658

3659

// Multiply by the weight of matrix-matrix product and store the result

3660

#if defined(ALPHA)

3661

SCALE_BLOCK(4, half, c, ALPHA);

3662

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

3667

3668

#if defined(BROADCAST_BIAS)

3669

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

3670

3671

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

3672

3673

#ifndef UNIT_BETA

3674

SCALE_BLOCK(1, half, bias, BETA);

3675

#endif // UNIT_BIAS

3676

3677

// c = c + bias[broadcasted]

3678

ADD_BLOCK_BROADCAST(4, c, bias0);

3679

3680

#else // defined(BROADCAST_BIAS)

3681

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

3682

2) * src2_stride_z;

3683

3684

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

3685

3686

#ifndef UNIT_BETA

3687

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias);

3692

3693

#endif // defined(BROADCAST_BIAS)

3694

#endif // defined(BETA)

3695

3696

#if defined(ACTIVATION_TYPE)

3697

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

3698

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3699

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3700

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3701

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

3702

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

3703

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

3704

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

3705

}

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

3706

3707

// Undefine COLS_MTX_B, the helper macro that gemm_mm_interleaved_transposed_f16_bifrost defines inside its body

3708

#undef COLS_MTX_B

3709

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

3710

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3711

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

3712

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3713

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

3714

// Compile the kernels below only when all three build options are supplied.
// Every term uses defined() so the guard tests for presence, not for the macro's value.
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)

3715

#if defined(DATA_TYPE)

3716

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

3717

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.

3718

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

3719

* @note This OpenCL kernel works with floating point data types (F16/F32)

3720

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

3721

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3722

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3723

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

3724

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3725

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3726

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

3727

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

3728

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

3729

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3730

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3731

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3732

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3733

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

3734

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

3735

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3736

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

3737

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

3738

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

3739

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

3740

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3741

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3742

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

3743

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

3744

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

3745

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

3746

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3747

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

3748

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3749

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

3750

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3751

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

3752

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

3753

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3754

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3755

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3756

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3757

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3758

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3759

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

3760

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

3761

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3762

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

3763

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

3764

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3765

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

3766

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
                                     IMAGE_DECLARATION(src1),
#if defined(BETA)
                                     IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                     IMAGE_DECLARATION(dst),
                                     uint src0_stride_z,
                                     uint src1_stride_z,
#if defined(BETA)
                                     uint src2_stride_z,
#endif //defined(BETA)
                                     uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                     ,
                                     uint src_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                     ,
                                     uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                     )
{
    // First column of matrix B / of the output handled by this work-item
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B
    // (s0 = byte offset into src0/matrix A, s1 = byte offset into src1/matrix B)
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx * sizeof(DATA_TYPE);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp the plane index to the last plane
    zin = min(DEPTH_GEMM3D - 1, zin);

    // Add offset due to the cross plane paddings
    zin *= (src_cross_plane_pad * src0_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src0_stride_z by DEPTH_GEMM3D
    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    src_addr.s0 += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    src_addr.s1 += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Byte offset one past the last element of the current row of matrix A
    int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

    // One accumulator vector per output row handled by this work-item
    VECTOR_TYPE acc0 = 0.0f;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VECTOR_TYPE acc1 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VECTOR_TYPE acc2 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VECTOR_TYPE acc3 = 0.0f;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    // Main loop: each iteration consumes two elements along the row of matrix A and two rows of matrix B
    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        // NOTE(review): LOAD_BLOCK is defined outside this block; it is expected to declare a0..a3 since they are used below - confirm
        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
#else // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B (two consecutive rows)
        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
        VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

        // Accumulate
        acc0 += b0 * (VECTOR_TYPE)a0.s0;
        acc0 += b1 * (VECTOR_TYPE)a0.s1;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * (VECTOR_TYPE)a1.s0;
        acc1 += b1 * (VECTOR_TYPE)a1.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * (VECTOR_TYPE)a2.s0;
        acc2 += b1 * (VECTOR_TYPE)a2.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * (VECTOR_TYPE)a3.s0;
        acc3 += b1 * (VECTOR_TYPE)a3.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Left-over loop: one element of matrix A and one row of matrix B per iteration
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
    {
#if defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A (each row carries its own cross-plane offset)
        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else // defined(REINTERPRET_INPUT_AS_3D)
        // Load values from matrix A
        DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Load values from matrix B
        VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

        // Accumulate
        acc0 += b0 * (VECTOR_TYPE)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += b0 * (VECTOR_TYPE)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += b0 * (VECTOR_TYPE)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += b0 * (VECTOR_TYPE)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    // Batch index
    int z = get_global_id(2);

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Compute dst address
    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row cross-plane offsets for the output; stays zero unless REINTERPRET_OUTPUT_AS_3D is defined
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
    // in order to take into account the presence of possible cross plane paddings
    //
    //  |                  |
    //  |      plane0      |
    //  |                  |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |                  |
    //  |      plane1      |
    //  |                  |
    //  |__________________|

    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    // Clamp the plane index to the last plane
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Add offset due to the cross plane paddings
    zout *= (dst_cross_plane_pad * dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
    // Zero offsets passed as the last LOAD_BLOCK argument for the bias loads below
    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // Broadcast bias: only the X offset is applied, so the same bias row is used for every output row and batch
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

#else // defined(BROADCAST_BIAS)
    // Full bias: offset by column (X), row (Y) and batch (Z)
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *
                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);
}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4027

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4028

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

4029

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4030

*

4031

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

4032

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

4033

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

4034

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

4035

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4036

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4037

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4038

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4039

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

4040

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4041

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

4042

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4043

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4044

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4045

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4046

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped

4047

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4048

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4049

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4050

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4051

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4052

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4053

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

4054

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

4055

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4056

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4057

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4058

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4059

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4060

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4061

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4062

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4063

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4064

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4065

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4066

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

4067

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4068

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4069

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4070

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4071

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4072

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4073

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4074

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4075

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4076

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

4077

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4078

*/

4079

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

4080

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4081

#if defined(BETA)

4082

IMAGE_DECLARATION(src2),

4083

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4084

IMAGE_DECLARATION(dst),

4085

uint src0_stride_z,

4086

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4087

#if defined(BETA)

4088

uint src2_stride_z,

4089

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4090

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4091

#if defined(REINTERPRET_INPUT_AS_3D)

4092

,

4093

uint src_cross_plane_pad

4094

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4095

#if defined(REINTERPRET_OUTPUT_AS_3D)

4096

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4097

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4098

#endif // REINTERPRET_OUTPUT_AS_3D

4099

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4100

{

4101

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

4102

4103

// Compute starting address for matrix A and matrix B

4104

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

4105

4106

// Update address for matrix A

4107

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

4108

4109

// Update address for matrix B

4110

src_addr.s1 += idx * sizeof(float);

4111

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4112

#if defined(REINTERPRET_INPUT_AS_3D)

4113

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

4114

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

4120

// |******************|

4121

// | cross_plane_pad |

4122

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

4127

4128

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

4129

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

4130

zin = min(DEPTH_GEMM3D - 1, zin);

4131

4132

// Add offset due to the cross plane paddings

4133

zin *= (src_cross_plane_pad * src0_stride_y);

4134

4135

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4136

// multiply src0_stride_z by DEPTH_GEMM3D

4137

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

4138

4139

#else // defined(REINTERPRET_INPUT_AS_3D)

4140

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4141

// Add offset for batched GEMM

4142

src_addr.s0 += get_global_id(2) * src0_stride_z;

4143

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4144

#endif // defined(REINTERPRET_INPUT_AS_3D)

4145

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4146

#if defined(MATRIX_B_DEPTH)

4147

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4148

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

4149

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4150

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4151

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4152

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4153

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4154

float4 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4155

4156

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4157

float4 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4158

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4159

4160

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4161

float4 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4162

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4163

4164

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4165

float4 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4166

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4167

4168

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4169

int i = 0;

4170

for(; i <= ((int)COLS_A - 4); i += 4)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4171

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4172

#if defined(REINTERPRET_INPUT_AS_3D)

4173

// Load values from matrix A and matrix B

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

4174

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

4175

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4176

// Load values from matrix A and matrix B

4177

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4178

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4179

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4180

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4181

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4182

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4183

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4184

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4185

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4186

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4187

#endif // defined(REINTERPRET_INPUT_AS_3D)

4188

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4189

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

4190

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4191

4192

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4193

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

4194

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

4195

acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);

4196

acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4197

4198

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4199

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4200

acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);

4201

acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);

4202

acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);

4203

acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4204

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4205

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4206

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4207

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4208

acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);

4209

acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);

4210

acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);

4211

acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4212

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4213

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4214

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4215

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4216

acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);

4217

acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);

4218

acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);

4219

acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4220

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4221

4222

// Load the next row of matrix B

4223

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

4224

src_addr.s1 += src1_stride_y;

4225

4226

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4227

acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);

4228

acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);

4229

acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);

4230

acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4231

4232

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4233

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4234

acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);

4235

acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);

4236

acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);

4237

acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4238

4239

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4240

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4241

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4242

acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);

4243

acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);

4244

acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);

4245

acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4246

4247

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4248

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4249

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4250

acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);

4251

acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);

4252

acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);

4253

acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4254

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4255

4256

// Load values from matrix A and matrix B

4257

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

4258

src_addr.s1 += src1_stride_y;

4259

4260

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4261

acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);

4262

acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);

4263

acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);

4264

acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4265

4266

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4267

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4268

acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);

4269

acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);

4270

acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);

4271

acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4272

4273

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4274

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4275

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4276

acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);

4277

acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);

4278

acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);

4279

acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4280

4281

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4282

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4283

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4284

acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);

4285

acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);

4286

acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);

4287

acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4288

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4289

4290

// Load values from matrix A and matrix B

4291

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

4292

src_addr.s1 += src1_stride_y;

4293

4294

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4295

acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);

4296

acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);

4297

acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);

4298

acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4299

4300

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4301

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4302

acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);

4303

acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);

4304

acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);

4305

acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4306

4307

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4308

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4309

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4310

acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);

4311

acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);

4312

acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);

4313

acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4314

4315

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4316

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4317

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4318

acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);

4319

acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);

4320

acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);

4321

acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4322

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4323

4324

src_addr.s0 += 4 * sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4325

}

4326

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4327

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4328

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4329

#if defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4330

// Load values from matrix A

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4331

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

4332

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4333

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

4334

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4335

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4336

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

4337

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4338

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4339

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

4340

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4341

#else // defined(REINTERPRET_INPUT_AS_3D)

4342

// Load values from matrix A

4343

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4344

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4345

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

4346

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4347

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4348

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

4349

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4350

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4351

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

4352

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4353

#endif // defined(REINTERPRET_INPUT_AS_3D)

4354

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4355

// Load values from matrix B

4356

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4357

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4358

4359

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4360

acc0.s0 = fma(a0, b0.s0, acc0.s0);

4361

acc0.s1 = fma(a0, b0.s1, acc0.s1);

4362

acc0.s2 = fma(a0, b0.s2, acc0.s2);

4363

acc0.s3 = fma(a0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4364

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4365

acc1.s0 = fma(a1, b0.s0, acc1.s0);

4366

acc1.s1 = fma(a1, b0.s1, acc1.s1);

4367

acc1.s2 = fma(a1, b0.s2, acc1.s2);

4368

acc1.s3 = fma(a1, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4369

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4370

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4371

acc2.s0 = fma(a2, b0.s0, acc2.s0);

4372

acc2.s1 = fma(a2, b0.s1, acc2.s1);

4373

acc2.s2 = fma(a2, b0.s2, acc2.s2);

4374

acc2.s3 = fma(a2, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4375

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4376

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4377

acc3.s0 = fma(a3, b0.s0, acc3.s0);

4378

acc3.s1 = fma(a3, b0.s1, acc3.s1);

4379

acc3.s2 = fma(a3, b0.s2, acc3.s2);

4380

acc3.s3 = fma(a3, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4381

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4382

4383

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4384

}

4385

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4386

int z = get_global_id(2);

4387

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4388

// Compute destination address

4389

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4390

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4391

// Compute dst address

4392

__global uchar *dst_addr = offset(&dst, 0, 0);

4393

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4394

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4395

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4396

#if defined(REINTERPRET_OUTPUT_AS_3D)

4397

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4398

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4399

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4404

// |******************|

4405

// | cross_plane_pad |

4406

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4411

4412

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4413

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

4414

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4415

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4416

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4417

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4418

4419

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4420

// multiply dst_stride_z by DEPTH_GEMM3D

4421

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4422

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4423

// Add offset for batched GEMM

4424

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4425

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4426

4427

// Multiply by the weight of matrix-matrix product and store the result

4428

#if defined(ALPHA)

4429

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

4430

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

4435

4436

#if defined(BROADCAST_BIAS)

4437

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4438

4439

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4440

4441

#ifndef UNIT_BETA

4442

SCALE_BLOCK(1, float, bias, BETA);

4443

#endif // UNIT_BETA

4444

4445

// acc = acc + bias[broadcasted]

4446

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

4447

4448

#else // defined(BROADCAST_BIAS)

4449

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *

4450

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

4451

4452

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4453

4454

#ifndef UNIT_BETA

4455

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

4460

4461

#endif // defined(BROADCAST_BIAS)

4462

#endif // defined(BETA)

4463

4464

#if defined(ACTIVATION_TYPE)

4465

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

4466

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4467

4468

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4469

vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4470

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4471

vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4472

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4473

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4474

vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4475

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4476

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4477

vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4478

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4479

}

4480

4481

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

4482

*

4483

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

4484

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.

4485

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

4486

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

4487

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

4488

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4489

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4490

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4491

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4492

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

4493

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4494

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

4495

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4496

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4497

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4498

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4499

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4500

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4501

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4502

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4503

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4504

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4505

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4506

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

4507

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

4508

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4509

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4510

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4511

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4512

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4513

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4514

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4515

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4516

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4517

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4518

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4519

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

4520

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4521

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4522

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4523

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4524

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4525

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4526

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4527

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4528

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4529

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

4530

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4531

*/

4532

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

4533

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4534

#if defined(BETA)

4535

IMAGE_DECLARATION(src2),

4536

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4537

IMAGE_DECLARATION(dst),

4538

uint src0_stride_z,

4539

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4540

#if defined(BETA)

4541

uint src2_stride_z,

4542

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4543

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4544

#if defined(REINTERPRET_INPUT_AS_3D)

4545

,

4546

uint src_cross_plane_pad

4547

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4548

#if defined(REINTERPRET_OUTPUT_AS_3D)

4549

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4550

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4551

#endif // REINTERPRET_OUTPUT_AS_3D

4552

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4553

{

4554

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect8, B (8 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4555

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

4556

4557

// Compute starting address for matrix A and Matrix B

4558

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

4559

4560

// Update address for the matrix A

4561

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

4562

4563

// Update address for the matrix B

4564

src_addr.s1 += idx * sizeof(float);

4565

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4566

#if defined(REINTERPRET_INPUT_AS_3D)

4567

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

4568

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

4574

// |******************|

4575

// | cross_plane_pad |

4576

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

4581

4582

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

4583

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

4584

zin = min(DEPTH_GEMM3D - 1, zin);

4585

4586

// Add offset due to the cross plane paddings

4587

zin *= (src_cross_plane_pad * src0_stride_y);

4588

4589

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4590

// multiply src0_stride_z by DEPTH_GEMM3D

4591

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

4592

4593

#else // defined(REINTERPRET_INPUT_AS_3D)

4594

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4595

// Add offset for batched GEMM

4596

src_addr.s0 += get_global_id(2) * src0_stride_z;

4597

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4598

#endif // defined(REINTERPRET_INPUT_AS_3D)

4599

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4600

#if defined(MATRIX_B_DEPTH)

4601

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4602

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

4603

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4604

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4605

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4606

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4607

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4608

float2 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4609

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4610

float2 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4611

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4612

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4613

float2 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4614

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4615

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4616

float2 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4617

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4618

4619

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4620

int i = 0;

4621

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4622

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4623

#if defined(REINTERPRET_INPUT_AS_3D)

4624

// Load values from matrix A

4625

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));

4626

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4627

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4628

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4629

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4630

4631

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4632

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

4633

src_addr.s1 += src1_stride_y;

4634

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

4635

src_addr.s1 += src1_stride_y;

4636

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

4637

src_addr.s1 += src1_stride_y;

4638

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

4639

src_addr.s1 += src1_stride_y;

4640

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

4641

src_addr.s1 += src1_stride_y;

4642

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

4643

src_addr.s1 += src1_stride_y;

4644

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

4645

src_addr.s1 += src1_stride_y;

4646

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

4647

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4648

4649

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4650

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

4651

acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);

4652

acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);

4653

acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);

4654

acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);

4655

acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);

4656

acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);

4657

acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4658

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4659

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

4660

acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);

4661

acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);

4662

acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);

4663

acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);

4664

acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);

4665

acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);

4666

acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4667

4668

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4669

#if defined(REINTERPRET_INPUT_AS_3D)

4670

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

4671

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4672

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4673

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4674

acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);

4675

acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);

4676

acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);

4677

acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);

4678

acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);

4679

acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);

4680

acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);

4681

acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4682

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4683

acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);

4684

acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);

4685

acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);

4686

acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);

4687

acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);

4688

acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);

4689

acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);

4690

acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4691

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4692

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4693

#if defined(REINTERPRET_INPUT_AS_3D)

4694

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

4695

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4696

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4697

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4698

acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);

4699

acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);

4700

acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);

4701

acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);

4702

acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);

4703

acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);

4704

acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);

4705

acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4706

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4707

acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);

4708

acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);

4709

acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);

4710

acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);

4711

acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);

4712

acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);

4713

acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);

4714

acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4715

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4716

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4717

#if defined(REINTERPRET_INPUT_AS_3D)

4718

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

4719

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4720

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4721

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4722

acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);

4723

acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);

4724

acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);

4725

acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);

4726

acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);

4727

acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);

4728

acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);

4729

acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4730

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4731

acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);

4732

acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);

4733

acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);

4734

acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);

4735

acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);

4736

acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);

4737

acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);

4738

acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4739

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4740

4741

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4742

}

4743

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4744

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4745

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4746

#if defined(REINTERPRET_INPUT_AS_3D)

4747

// Load values from matrix A

4748

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

4749

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4750

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

4751

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4752

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4753

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

4754

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4755

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4756

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

4757

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4758

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4759

// Load values from matrix A

4760

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

4761

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4762

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

4763

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4764

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4765

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

4766

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4767

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

4768

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

4769

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4770

#endif // defined(REINTERPRET_INPUT_AS_3D)

4771

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4772

// Load values from matrix B

4773

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4774

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4775

4776

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4777

acc0.s0 = fma(a0, b0.s0, acc0.s0);

4778

acc0.s1 = fma(a0, b0.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4779

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4780

acc1.s0 = fma(a1, b0.s0, acc1.s0);

4781

acc1.s1 = fma(a1, b0.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4782

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4783

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4784

acc2.s0 = fma(a2, b0.s0, acc2.s0);

4785

acc2.s1 = fma(a2, b0.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4786

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4787

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4788

acc3.s0 = fma(a3, b0.s0, acc3.s0);

4789

acc3.s1 = fma(a3, b0.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4790

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4791

4792

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4793

}

4794

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4795

int z = get_global_id(2);

4796

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4797

// Compute destination address

4798

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4799

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4800

// Compute dst address

4801

__global uchar *dst_addr = offset(&dst, 0, 0);

4802

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4803

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4804

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4805

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4806

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4807

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4808

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4809

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4814

// |******************|

4815

// | cross_plane_pad |

4816

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4821

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4822

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4823

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

4824

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4825

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4826

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4827

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4828

4829

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4830

// multiply dst_stride_z by DEPTH_GEMM3D

4831

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4832

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4833

// Add offset for batched GEMM

4834

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4835

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4836

4837

// Multiply by the weight of matrix-matrix product and store the result

4838

#if defined(ALPHA)

4839

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

4840

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

4845

4846

#if defined(BROADCAST_BIAS)

4847

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));

4848

4849

LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

4850

4851

#ifndef UNIT_BETA

4852

SCALE_BLOCK(1, float, bias, BETA);

4853

#endif // UNIT_BETA

4854

4855

// acc = acc + bias[broadcasted]

4856

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

4857

4858

#else // defined(BROADCAST_BIAS)

4859

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *

4860

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

4861

4862

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

4863

4864

#ifndef UNIT_BETA

4865

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

4870

4871

#endif // defined(BROADCAST_BIAS)

4872

#endif // defined(BETA)

4873

4874

#if defined(ACTIVATION_TYPE)

4875

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

4876

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4877

4878

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4879

vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4880

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4881

vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4882

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

4883

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4884

vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4885

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

4886

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4887

vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4888

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4889

}

4890

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

4891

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

4892

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

4893

*

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

4894

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in 32-bit floating point variables.

4895

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

4896

* This kernel loads and stores 8 elements along x per work-item, so it is expected to be built with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=8.

4897

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

4898

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4899

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4900

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

4901

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4902

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

4903

* The activation function is performed after the bias addition

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

4904

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

4905

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

4906

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4907

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4908

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4909

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4910

*

4911

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

4912

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4913

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

4914

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4915

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

4916

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

4917

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

4918

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4919

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

4920

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4921

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

4922

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4923

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4924

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4925

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4926

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4927

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4928

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

4929

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

4930

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4931

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

4932

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4933

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

4934

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

4935

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4936

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4937

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

4938

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

4939

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

4940

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

4941

*/

4942

__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),

4943

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4944

#if defined(BETA)

4945

IMAGE_DECLARATION(src2),

4946

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

4947

IMAGE_DECLARATION(dst),

4948

uint src0_stride_z,

4949

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

4950

#if defined(BETA)

4951

uint src2_stride_z,

4952

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

4953

uint dst_stride_z

4954

#if defined(REINTERPRET_INPUT_AS_3D)

4955

,

4956

uint src_cross_plane_pad

4957

#endif // REINTERPRET_INPUT_AS_3D

4958

#if defined(REINTERPRET_OUTPUT_AS_3D)

4959

,

4960

uint dst_cross_plane_pad

4961

#endif // REINTERPRET_OUTPUT_AS_3D

4962

)

4963

{

4964

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

4965

4966

// Compute starting address for matrix A and Matrix B

4967

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

4968

4969

// Update address for the matrix A

4970

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

4971

4972

// Update address for the matrix B

4973

src_addr.s1 += idx * sizeof(half);

4974

4975

#if defined(REINTERPRET_INPUT_AS_3D)

4976

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

4977

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

4983

// |******************|

4984

// | cross_plane_pad |

4985

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

4990

4991

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

4992

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

4993

zin = min(DEPTH_GEMM3D - 1, zin);

4994

4995

// Add offset due to the cross plane paddings

4996

zin *= (src_cross_plane_pad * src0_stride_y);

4997

4998

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4999

// multiply src0_stride_z by DEPTH_GEMM3D

5000

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5001

5002

#else // defined(REINTERPRET_INPUT_AS_3D)

5003

5004

// Add offset for batched GEMM

5005

src_addr.s0 += get_global_id(2) * src0_stride_z;

5006

5007

#endif // defined(REINTERPRET_INPUT_AS_3D)

5008

5009

#if defined(MATRIX_B_DEPTH)

5010

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5011

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

5012

#else // defined(MATRIX_B_DEPTH)

5013

src_addr.s1 += get_global_id(2) * src1_stride_z;

5014

#endif // defined(MATRIX_B_DEPTH)

5015

5016

float8 acc0 = 0.0h;

5017

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5018

float8 acc1 = 0.0h;

5019

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5020

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5021

float8 acc2 = 0.0h;

5022

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5023

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5024

float8 acc3 = 0.0h;

5025

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5026

5027

int i = 0;

5028

for(; i <= ((int)COLS_A - 4); i += 4)

5029

{

5030

#if defined(REINTERPRET_INPUT_AS_3D)

5031

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5032

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

5033

#else // defined(REINTERPRET_INPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5034

// Load values from matrix A

5035

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5036

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5037

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5038

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5039

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5040

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5041

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5042

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5043

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5044

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5045

#endif // defined(REINTERPRET_INPUT_AS_3D)

5046

5047

// Load values from matrix B

5048

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

5049

src_addr.s1 += src1_stride_y;

5050

5051

// Accumulate

5052

acc0 = fma(b0, (float8)a0.s0, acc0);

5053

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5054

acc1 = fma(b0, (float8)a1.s0, acc1);

5055

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5056

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5057

acc2 = fma(b0, (float8)a2.s0, acc2);

5058

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5059

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5060

acc3 = fma(b0, (float8)a3.s0, acc3);

5061

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5062

5063

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

5064

src_addr.s1 += src1_stride_y;

5065

acc0 = fma(b0, (float8)a0.s1, acc0);

5066

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5067

acc1 = fma(b0, (float8)a1.s1, acc1);

5068

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5069

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5070

acc2 = fma(b0, (float8)a2.s1, acc2);

5071

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5072

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5073

acc3 = fma(b0, (float8)a3.s1, acc3);

5074

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5075

5076

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

5077

src_addr.s1 += src1_stride_y;

5078

acc0 = fma(b0, (float8)a0.s2, acc0);

5079

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5080

acc1 = fma(b0, (float8)a1.s2, acc1);

5081

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5082

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5083

acc2 = fma(b0, (float8)a2.s2, acc2);

5084

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5085

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5086

acc3 = fma(b0, (float8)a3.s2, acc3);

5087

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5088

5089

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

5090

src_addr.s1 += src1_stride_y;

5091

acc0 = fma(b0, (float8)a0.s3, acc0);

5092

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5093

acc1 = fma(b0, (float8)a1.s3, acc1);

5094

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5095

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5096

acc2 = fma(b0, (float8)a2.s3, acc2);

5097

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5098

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5099

acc3 = fma(b0, (float8)a3.s3, acc3);

5100

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5101

5102

src_addr.s0 += 4 * sizeof(half);

5103

}

5104

5105

for(; i < (int)COLS_A; ++i)

5106

{

5107

#if defined(REINTERPRET_INPUT_AS_3D)

5108

// Load values from matrix A

5109

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

5110

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5111

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

5112

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5113

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5114

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

5115

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5116

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5117

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

5118

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5119

#else // defined(REINTERPRET_INPUT_AS_3D)

5120

// Load values from matrix A

5121

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5122

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5123

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5124

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5125

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5126

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5127

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5128

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5129

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5130

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5131

#endif // defined(REINTERPRET_INPUT_AS_3D)

5132

5133

// Load values from matrix B

5134

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

5135

5136

src_addr += (int2)(sizeof(half), src1_stride_y);

5137

5138

// Accumulate

5139

acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;

5140

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5141

acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;

5142

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5143

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5144

acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;

5145

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5146

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5147

acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;

5148

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5149

}

5150

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5151

int z = get_global_id(2);

5152

5153

// Compute destination address

5154

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5155

5156

// Compute dst address

5157

__global uchar *dst_addr = offset(&dst, 0, 0);

5158

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5159

uint4 zout = 0;

5160

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5161

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5162

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5163

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

5164

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5170

// |******************|

5171

// | cross_plane_pad |

5172

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5177

5178

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5179

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5180

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5181

5182

// Add offset due to the cross plane paddings

5183

zout *= (dst_cross_plane_pad * dst_stride_y);

5184

5185

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5186

// multiply dst_stride_z by DEPTH_GEMM3D

5187

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5188

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5189

// Add offset for batched GEMM

5190

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5191

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5192

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5193

// Multiply by the weight of matrix-matrix product and store the result

5194

#if defined(ALPHA)

5195

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

5196

#endif // defined(ALPHA)

5197

5198

#if defined(BETA)

5199

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

5200

5201

#if defined(BROADCAST_BIAS)

5202

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5203

5204

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5205

5206

float8 bias_f0 = convert_float8(bias0);

5207

5208

#ifndef UNIT_BETA

5209

SCALE_BLOCK(1, float, bias_f, BETA);

5210

#endif // UNIT_BETA

5211

5212

// acc = acc + bias[broadcasted]

5213

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);

5214

5215

#else // defined(BROADCAST_BIAS)

5216

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

5217

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

5218

5219

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5220

5221

float8 bias_f0 = convert_float8(bias0);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5222

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5223

float8 bias_f1 = convert_float8(bias1);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5224

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5225

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5226

float8 bias_f2 = convert_float8(bias2);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5227

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5228

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5229

float8 bias_f3 = convert_float8(bias3);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5230

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5231

5232

#ifndef UNIT_BETA

5233

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);

5238

5239

#endif // defined(BROADCAST_BIAS)

5240

#endif // defined(BETA)

5241

5242

half8 acc_h0 = convert_half8(acc0);

5243

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5244

half8 acc_h1 = convert_half8(acc1);

5245

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5246

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5247

half8 acc_h2 = convert_half8(acc2);

5248

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5249

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5250

half8 acc_h3 = convert_half8(acc3);

5251

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5252

5253

#if defined(ACTIVATION_TYPE)

5254

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);

5255

#endif // defined(ACTIVATION_TYPE)

5256

5257

// Store the output block

5258

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

5259

}

5260

5261

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

5262

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5263

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.

5264

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

5265

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

5266

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

5267

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5268

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5269

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5270

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5271

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5272

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5273

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5274

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5275

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5276

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5277

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5278

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5279

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5280

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5281

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5282

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

5283

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5284

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

5285

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5286

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5287

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5288

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

5289

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5290

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

5291

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5292

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5293

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5294

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5295

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5296

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5297

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5298

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

5299

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5300

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5301

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5302

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5303

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5304

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5305

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5306

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5307

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5308

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5309

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5310

*/

5311

__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),

5312

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5313

#if defined(BETA)

5314

IMAGE_DECLARATION(src2),

5315

#endif // defined(BETA)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5316

IMAGE_DECLARATION(dst),

5317

uint src0_stride_z,

5318

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5319

#if defined(BETA)

5320

uint src2_stride_z,

5321

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5322

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5323

#if defined(REINTERPRET_INPUT_AS_3D)

5324

,

5325

uint src_cross_plane_pad

5326

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5327

#if defined(REINTERPRET_OUTPUT_AS_3D)

5328

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5329

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5330

#endif // REINTERPRET_OUTPUT_AS_3D

5331

)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5332

{

5333

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

5334

5335

// Compute starting address for matrix A and Matrix B

5336

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

5337

5338

// Update address for the matrix A

5339

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

5340

5341

// Update address for the matrix B

5342

src_addr.s1 += idx * sizeof(half);

5343

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5344

#if defined(REINTERPRET_INPUT_AS_3D)

5345

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5346

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5352

// |******************|

5353

// | cross_plane_pad |

5354

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5359

5360

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5361

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5362

zin = min(DEPTH_GEMM3D - 1, zin);

5363

5364

// Add offset due to the cross plane paddings

5365

zin *= (src_cross_plane_pad * src0_stride_y);

5366

5367

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5368

// multiply src0_stride_z by DEPTH_GEMM3D

5369

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5370

5371

#else // defined(REINTERPRET_INPUT_AS_3D)

5372

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5373

// Add offset for batched GEMM

5374

src_addr.s0 += get_global_id(2) * src0_stride_z;

5375

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5376

#endif // defined(REINTERPRET_INPUT_AS_3D)

5377

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5378

#if defined(MATRIX_B_DEPTH)

5379

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5380

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

5381

#else // defined(MATRIX_B_DEPTH)

5382

src_addr.s1 += get_global_id(2) * src1_stride_z;

5383

#endif // defined(MATRIX_B_DEPTH)

5384

5385

half8 acc0 = 0.0h;

5386

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5387

half8 acc1 = 0.0h;

5388

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5389

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5390

half8 acc2 = 0.0h;

5391

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5392

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5393

half8 acc3 = 0.0h;

5394

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5395

5396

int i = 0;

5397

for(; i <= ((int)COLS_A - 4); i += 4)

5398

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5399

#if defined(REINTERPRET_INPUT_AS_3D)

5400

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5401

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

5402

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5403

// Load values from matrix A

5404

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5405

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5406

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5407

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5408

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5409

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5410

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5411

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5412

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5413

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5414

#endif // defined(REINTERPRET_INPUT_AS_3D)

5415

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5416

// Load values from matrix B

5417

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

5418

src_addr.s1 += src1_stride_y;

5419

5420

// Accumulate

5421

acc0 = fma(b0, (half8)a0.s0, acc0);

5422

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5423

acc1 = fma(b0, (half8)a1.s0, acc1);

5424

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5425

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5426

acc2 = fma(b0, (half8)a2.s0, acc2);

5427

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5428

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5429

acc3 = fma(b0, (half8)a3.s0, acc3);

5430

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5431

5432

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

5433

src_addr.s1 += src1_stride_y;

5434

acc0 = fma(b0, (half8)a0.s1, acc0);

5435

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5436

acc1 = fma(b0, (half8)a1.s1, acc1);

5437

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5438

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5439

acc2 = fma(b0, (half8)a2.s1, acc2);

5440

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5441

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5442

acc3 = fma(b0, (half8)a3.s1, acc3);

5443

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5444

5445

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

5446

src_addr.s1 += src1_stride_y;

5447

acc0 = fma(b0, (half8)a0.s2, acc0);

5448

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5449

acc1 = fma(b0, (half8)a1.s2, acc1);

5450

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5451

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5452

acc2 = fma(b0, (half8)a2.s2, acc2);

5453

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5454

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5455

acc3 = fma(b0, (half8)a3.s2, acc3);

5456

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5457

5458

b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

5459

src_addr.s1 += src1_stride_y;

5460

acc0 = fma(b0, (half8)a0.s3, acc0);

5461

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5462

acc1 = fma(b0, (half8)a1.s3, acc1);

5463

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5464

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5465

acc2 = fma(b0, (half8)a2.s3, acc2);

5466

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5467

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5468

acc3 = fma(b0, (half8)a3.s3, acc3);

5469

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5470

5471

src_addr.s0 += 4 * sizeof(half);

5472

}

5473

5474

for(; i < (int)COLS_A; ++i)

5475

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5476

#if defined(REINTERPRET_INPUT_AS_3D)

5477

// Load values from matrix A

5478

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

5479

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5480

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

5481

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5482

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5483

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

5484

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5485

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5486

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

5487

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5488

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5489

// Load values from matrix A

5490

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5491

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5492

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5493

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5494

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5495

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5496

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5497

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5498

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5499

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5500

#endif // defined(REINTERPRET_INPUT_AS_3D)

5501

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5502

// Load values from matrix B

5503

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

5504

5505

src_addr += (int2)(sizeof(half), src1_stride_y);

5506

5507

// Accumulate

5508

acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;

5509

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5510

acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;

5511

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5512

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5513

acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;

5514

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5515

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5516

acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;

5517

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5518

}

5519

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5520

int z = get_global_id(2);

5521

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5522

// Compute destination address

5523

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5524

5525

// Compute dst address

5526

__global uchar *dst_addr = offset(&dst, 0, 0);

5527

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5528

uint4 zout = 0;

5529

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5530

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5531

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5532

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5533

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5534

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5539

// |******************|

5540

// | cross_plane_pad |

5541

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5546

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5547

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5548

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5549

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5550

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5551

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5552

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5553

5554

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5555

// multiply dst_stride_z by DEPTH_GEMM3D

5556

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5557

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5558

// Add offset for batched GEMM

5559

dst_addr += z * dst_stride_z;

5560

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5561

5562

// Multiply by the weight of matrix-matrix product and store the result

5563

#if defined(ALPHA)

5564

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);

5565

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

5570

5571

#if defined(BROADCAST_BIAS)

5572

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5573

5574

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5575

5576

#ifndef UNIT_BETA

5577

SCALE_BLOCK(1, half, bias, BETA);

5578

#endif // UNIT_BETA

5579

5580

// acc = acc + bias[broadcasted]

5581

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

5582

5583

#else // defined(BROADCAST_BIAS)

5584

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

5585

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

5586

5587

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5588

5589

#ifndef UNIT_BETA

5590

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

5595

5596

#endif // defined(BROADCAST_BIAS)

5597

#endif // defined(BETA)

5598

5599

#if defined(ACTIVATION_TYPE)

5600

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);

5601

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5602

5603

// Store the output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5604

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5605

}

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

5606

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

5607

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5608

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5609

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5610

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5611

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

5612

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

5613

* @note The value of beta needs to be passed at compile time using -DBETA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5614

*

5615

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

5616

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

5617

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5618

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

5619

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5620

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

5621

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5622

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5623

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5624

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5625

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

5626

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5627

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5628

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

5629

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5630

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

5631

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5632

__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),

5633

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5634

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5635

// Compute source and destination addresses

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5636

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

5637

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5638

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5639

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5640

float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

5641

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5642

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5643

float4 c = vload4(0, (__global float *)src.ptr);

5644

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5645

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5646

float4 out = alpha_ab + (float4)BETA * c;

5647

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5648

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5649

vstore4(out, 0, (__global float *)dst.ptr);

5650

}

5651

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

5652

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5653

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

5654

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

5655

* @note The value of beta needs to be passed at compile time using -DBETA

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5656

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5657

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

5658

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

5659

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5660

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

5661

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5662

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

5663

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5664

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5665

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5666

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5667

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

5668

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5669

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5670

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

5671

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5672

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

5673

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5674

__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),

5675

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5676

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5677

// Compute source and destination addresses

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5678

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

5679

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5680

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5681

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5682

half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

5683

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5684

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5685

half8 c = vload8(0, (__global half *)src.ptr);

5686

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5687

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5688

half8 out = alpha_ab + (half8)BETA * c;

5689

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5690

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5691

vstore8(out, 0, (__global half *)dst.ptr);

5692

}

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

5693

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5694

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5695

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5696

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5697

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

5698

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

5699

* @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5700

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

5701

* @note The input A and matrix B must not be reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5702

*

5703

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

5704

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5705

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5706

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5707

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5708

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5709

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5710

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5711

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5712

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5713

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5714

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

5715

* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

5716

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5717

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5718

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5719

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

5720

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5721

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

5722

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

5723

*/

5724

__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),

5725

TENSOR3D_DECLARATION(src1),

5726

IMAGE_DECLARATION(dst))

5727

{

5728

// X index scaled by 4: each work-item accumulates and stores one float4 (see vload4/vstore4 below).
int idx = get_global_id(0) * 4;

5729

// Y index: selects the row of src0 (via src0_stride_y) and the Z slice of src1 (via src1_stride_z).
int idy = get_global_id(1);

5730

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5731

// Compute the address for the vector A and matrix B
// src_addr.s0 is a byte offset into src0, src_addr.s1 is a byte offset into src1.

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5732

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));

5733

// Advance the src1 offset to this work-item's first column (idx floats along X).
src_addr.s1 += idx * sizeof(float);

5734

5735

// Byte offset one past the last of the WIDTH_VECTOR_A floats read from src0.
int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

// Running sum for the four output values of this work-item.
float4 acc = 0.0f;

Georgios Pinitas

2017-10-20 18:52:20 +0100

[diff] [blame]

5739

// Main loop: runs while at least two floats of A remain; each iteration consumes
// two elements of A and two rows of src1 (rows are src1_stride_y bytes apart).
for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5740

{

5741

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));

5742

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

5743

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

5744

5745

// Broadcast each scalar of A across the four lanes and accumulate the scaled row of src1.
acc += b0 * (float4)a0.s0;

5746

acc += b1 * (float4)a0.s1;

5747

}

5748

5749

// Tail loop: handles what the two-at-a-time loop left over (at most one element of A).
for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

5750

{

5751

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

5752

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

5753

5754

acc += b0 * (float4)a0;

5755

}

5756

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5757

// Compute destination address
// NOTE(review): CONVERT_TO_IMAGE_STRUCT is defined outside this view; presumably it
// positions dst at this work-item's output element - confirm against the helper header.

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5758

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5759

5760

// Write the four accumulated floats in a single vector store.
vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));

5761

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5762

#endif // defined(WIDTH_VECTOR_A)

5763

5764

/** This kernel accumulates each row with the biases vector.

5765

*

5766

* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.

5767

* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.

5768

*

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

5769

* @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: U8/S8/U16/S16/F16/U32/S32/F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5770

* @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)

5771

* @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)

5772

* @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)

5773

* @param[in] accum_step_y accum_stride_y * number of elements along Y processed per workitem(in bytes)

5774

* @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor

5775

* @param[in] biases_ptr Pointer to the biases vector. Supported data type: same as @p accum_ptr

5776

* @param[in] biases_stride_x Stride of the biases vector in X dimension (in bytes)

5777

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

5778

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases vector

5779

*/

5780

#if defined(DATA_TYPE) && defined(VECTOR_SIZE)

5781

__kernel void gemm_accumulate_biases(

5782

IMAGE_DECLARATION(accum),

5783

VECTOR_DECLARATION(biases))

5784

{

5785

Image accum = CONVERT_TO_IMAGE_STRUCT(accum);

5786

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

5787

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame^]

5788

// Vector size, e.g. number of vector elements.

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5789

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

5790

accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);

5791

VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)

5792

biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

5793

accum_value = biases_value + accum_value;

Anton Lokhmotov