Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2018-12-06 17:13:09 +0000

[diff] [blame]

45

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

46

/** Boundary-aware load of a block of (up to) M0 rows x K0 columns.
 *
 * Picks one of four LOAD_TENSOR_M0XN0 invocations, depending on whether the block handled by the
 * current work-item reaches the bottom edge (SRC_HEIGHT) and/or the right edge (SRC_WIDTH) of the source:
 *  - bottom and right edge : PARTIAL_LOAD_M0 x PARTIAL_LOAD_K0
 *  - bottom edge only      : PARTIAL_LOAD_M0 x K0
 *  - right edge only       : M0 x PARTIAL_LOAD_K0
 *  - interior block        : M0 x K0
 * An edge is only treated as partial when the matching PARTIAL_LOAD_* value is different from 0.
 *
 * @note The macro is NOT self-contained: it reads the variables @p x and @p y from the enclosing scope and the
 *       compile-time symbols SRC_WIDTH, SRC_HEIGHT, PARTIAL_LOAD_M0 and PARTIAL_LOAD_K0.
 * @note M0 and K0 are parenthesized in the boundary arithmetic, but forwarded untouched to LOAD_TENSOR_M0XN0.
 *       NOTE(review): LOAD_TENSOR_M0XN0 is defined outside this section; presumably it needs plain literals - confirm.
 *
 * @param[in] M0           Number of rows of a full block
 * @param[in] K0           Number of columns of a full block
 * @param[in] DATA_TYPE    Data type of the elements
 * @param[in] a            Basename of the destination variables (forwarded to LOAD_TENSOR_M0XN0)
 * @param[in] input_ptr    Address of the first element of the block
 * @param[in] src_stride_y Stride of the source in Y dimension (in bytes)
 * @param[in] zin          Basename of the per-row z offset variables (forwarded to LOAD_TENSOR_M0XN0)
 */
#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                     \
    ({                                                                                                           \
        if((y * (M0) + (M0) >= SRC_HEIGHT) && (PARTIAL_LOAD_M0 != 0))                                            \
        {                                                                                                        \
            if((x * (K0) + (K0) >= SRC_WIDTH) && (PARTIAL_LOAD_K0 != 0))                                         \
            {                                                                                                    \
                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
            }                                                                                                    \
            else                                                                                                 \
            {                                                                                                    \
                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);              \
            }                                                                                                    \
        }                                                                                                        \
        else                                                                                                     \
        {                                                                                                        \
            if((x * (K0) + (K0) >= SRC_WIDTH) && (PARTIAL_LOAD_K0 != 0))                                         \
            {                                                                                                    \
                LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);              \
            }                                                                                                    \
            else                                                                                                 \
            {                                                                                                    \
                LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);                           \
            }                                                                                                    \
        }                                                                                                        \
    })

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

72

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in

73

* the output matrix unrolling the values.

74

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

75

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

76

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

77

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

78

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

79

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

80

* @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)

81

* @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

82

* @note Only the following values for M0, K0 and V0 are supported:

83

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

84

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

85

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

86

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

87

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

88

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

89

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

90

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

91

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

92

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

93

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

94

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

95

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

96

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

97

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

98

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

99

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

100

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

101

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

102

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

103

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

104

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

105

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

106

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

107

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

108

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

109

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

110

*/

111

__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),

112

TENSOR3D_DECLARATION(dst)

113

#if defined(REINTERPRET_INPUT_AS_3D)

114

,

115

uint cross_plane_pad

116

#endif // REINTERPRET_INPUT_AS_3D

117

)

118

{

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

119

// Block size

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

120

#define BLOCK_SIZE ((M0) * (K0))

121

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

122

// Output offset X

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

123

#if defined(INTERLEAVE)

124

#define OUTPUT_OFFSET_X (K0)

125

#else // defined(INTERLEAVE)

126

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

127

#endif // defined(INTERLEAVE)

128

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

129

// Output step X

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

130

#if defined(INTERLEAVE)

131

#define OUTPUT_STEP_X (K0) * (V0)

132

#else // Do not interleave

133

#define OUTPUT_STEP_X (K0)

134

#endif // defined(INTERLEAVE)

135

136

// Compute source and destination addresses

137

uint x = get_global_id(0);

138

uint y = get_global_id(1);

139

uint z = get_global_id(2);

140

141

// ------------------ Compute input/output addresses ---------------------------

142

143

// Compute the input address

144

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

145

146

// Compute the output address

147

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *

148

(uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

149

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

150

// Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;

151

REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

152

153

#if defined(REINTERPRET_INPUT_AS_3D)

154

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

155

// multiply src_stride_z by DEPTH_GEMM3D

156

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

157

input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

158

159

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

160

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

161

162

#else // defined(REINTERPRET_INPUT_AS_3D)

163

164

input_ptr += z * (uint)src_stride_z;

165

166

#endif // defined(REINTERPRET_INPUT_AS_3D)

167

168

// Add offset for batched GEMM

169

output_ptr += z * (uint)dst_stride_z;

170

171

// ---------------------------Load input values --------------------------------

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

172

// Load values from the LHS matrix

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

173

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);

174

175

LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);

176

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

177

// ---------------------------Store output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

178

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

179

STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

180

181

#undef BLOCK_SIZE

182

#undef OUTPUT_OFFSET_X

183

#undef OUTPUT_STEP_X

184

}

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

185

186

/* TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)
 *
 * Gathers component .s<i> of the variables a0...a(M0-1) found in the enclosing scope and stores the M0 values
 * contiguously at
 *     (output_ptr) + 0x<i> * (output_step_x) * sizeof(DATA_TYPE)
 *
 * - output_ptr    : byte address of the destination block
 * - output_step_x : distance between two consecutive stored columns, in elements (scaled by sizeof(DATA_TYPE) here)
 * - i             : single hexadecimal digit (0-9, A-F); it is pasted both into the .s<i> component selector
 *                   and into the 0x<i> literal, so it must be a bare token, not an expression
 *
 * One variant exists per supported M0 (2..8). For M0 = 5, 6 and 7 the column is written in two pieces
 * (4 elements, then the remaining 1, 2 or 3 elements).
 * Both pointer/step arguments are parenthesized so that the result does not depend on the caller's operator precedence.
 */
#if M0 == 2
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                      \
    ({                                                                                                \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                  \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i);                                       \
        VSTORE(M0)                                                                                    \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 3 // M0 == 3
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                      \
    ({                                                                                                \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                  \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i);                              \
        VSTORE(M0)                                                                                    \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 4 // M0 == 4
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                      \
    ({                                                                                                \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                  \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                     \
        VSTORE(M0)                                                                                    \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE))); \
    })
#elif M0 == 5 // M0 == 5
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                       \
        res0           = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);               \
        DATA_TYPE res1 = a4.s##i;                                                                         \
        VSTORE(4)                                                                                         \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));    \
        *((__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4) = res1; \
    })
#elif M0 == 6 // M0 == 6
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                           \
    ({                                                                                                     \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                        \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                          \
        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                        \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i);                                            \
        VSTORE(4)                                                                                          \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
        VSTORE(2)                                                                                          \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 7 // M0 == 7
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                           \
    ({                                                                                                     \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                        \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                          \
        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                        \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i);                                   \
        VSTORE(4)                                                                                          \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
        VSTORE(3)                                                                                          \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4); \
    })
#elif M0 == 8 // M0 == 8
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                                      \
    ({                                                                                                                \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                                  \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
        VSTORE(M0)                                                                                                    \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));                 \
    })
#else // M0 not supported
#error "M0 value not supported"
#endif // M0 conditions

255

256

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in

257

* the output matrix unrolling the values.

258

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

259

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

260

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

261

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

262

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

263

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

264

* @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)

265

* @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

266

* @note Only the following values for M0, K0 and V0 are supported:

267

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

268

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

269

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

270

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

271

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

272

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

273

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

274

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

275

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

276

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

277

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

278

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

279

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

280

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

281

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

282

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

283

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

284

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

285

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

286

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

287

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

288

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

289

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

290

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

291

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

292

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

293

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

294

*/

295

__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),

296

TENSOR3D_DECLARATION(dst)

297

#if defined(REINTERPRET_INPUT_AS_3D)

298

,

299

uint cross_plane_pad

300

#endif // REINTERPRET_INPUT_AS_3D

)

{

// Block size

#define BLOCK_SIZE ((M0) * (K0))

305

306

// Output offset X

307

#if defined(INTERLEAVE)

308

#define OUTPUT_OFFSET_X (M0)

309

#else // defined(INTERLEAVE)

310

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

311

#endif // defined(INTERLEAVE)

312

313

// Output step X

314

#if defined(INTERLEAVE)

315

#define OUTPUT_STEP_X (M0) * (V0)

316

#else // Do not interleave

317

#define OUTPUT_STEP_X (M0)

318

#endif // defined(INTERLEAVE)

319

320

// Compute source and destination addresses

321

uint x = get_global_id(0);

322

uint y = get_global_id(1);

323

uint z = get_global_id(2);

324

325

// ------------------ Compute input/output addresses ---------------------------

326

327

// Compute the input address

328

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

329

330

// Compute the output address

331

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) *

332

(uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

333

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

334

// Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;

335

REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

336

337

#if defined(REINTERPRET_INPUT_AS_3D)

338

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

339

// multiply src_stride_z by DEPTH_GEMM3D

340

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

341

input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

342

343

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

344

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

345

346

#else // defined(REINTERPRET_INPUT_AS_3D)

347

348

input_ptr += z * (uint)src_stride_z;

349

350

#endif // defined(REINTERPRET_INPUT_AS_3D)

351

352

// Add offset for batched GEMM

353

output_ptr += z * (uint)dst_stride_z;

354

355

// ---------------------------Load input values --------------------------------

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

356

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

357

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

358

LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);

359

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

360

// ---------------------------Transpose and store block -----------------------

361

362

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);

363

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);

364

#if K0 > 2

365

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

366

#endif // K0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

367

#if K0 > 3

368

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);

369

#endif // K0 > 3

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

370

#if K0 > 4

371

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);

372

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);

373

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);

374

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);

375

#endif // K0 > 4

376

#if K0 > 8

377

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);

378

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);

379

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);

380

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);

381

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);

382

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);

383

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);

384

TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);

#endif // K0 > 8

#undef BLOCK_SIZE

#undef OUTPUT_OFFSET_X

389

#undef OUTPUT_STEP_X

390

}

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame]

391

#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

392

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

393

#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

394

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in

395

* the output matrix unrolling the values.

396

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

397

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

398

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

399

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

400

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

401

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

402

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

403

* N0: 2,3,4,8,16

404

* K0: 1,2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

405

* H0: greater than 0

406

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

407

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

408

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

409

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

410

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

411

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

412

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

413

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

414

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

415

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

416

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

417

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

418

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

419

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

420

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

421

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

422

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

423

*/

424

__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),

425

TENSOR3D_DECLARATION(dst))

426

{

427

// Block size

428

#define BLOCK_SIZE ((K0) * (N0))

429

430

// Output offset X

431

#if defined(INTERLEAVE)

432

#define OUTPUT_OFFSET_X (N0)

433

#else // defined(INTERLEAVE)

434

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

435

#endif // defined(INTERLEAVE)

436

437

// Output step X

438

#if defined(INTERLEAVE)

439

#define OUTPUT_STEP_X (N0) * (H0)

440

#else // Do not interleave

441

#define OUTPUT_STEP_X (N0)

442

#endif // defined(INTERLEAVE)

443

444

// Compute source and destination addresses

445

uint x = get_global_id(0);

446

uint y = get_global_id(1);

447

uint z = get_global_id(2);

448

449

// ------------------ Compute input/output addresses ---------------------------

450

451

// Compute the input address

452

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

453

454

// Compute the output address

455

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((

456

x / (uint)H0)

457

* (uint)dst_stride_y)

458

+ z * (uint)dst_stride_z;

459

460

// ---------------------------Load input values --------------------------------

461

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

462

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

463

464

// Load values from the RHS matrix

465

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

466

#if K0 > 1

467

if(y * (uint)K0 + 1 < SRC_HEIGHT)

468

{

469

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

}

#endif // K0 > 1

#if K0 > 2

if(y * (uint)K0 + 2 < SRC_HEIGHT)

474

{

475

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

476

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

477

#endif // K0 > 2

478

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

479

if(y * (uint)K0 + 3 < SRC_HEIGHT)

480

{

481

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

482

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

483

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

484

#if K0 > 4

485

if(y * (uint)K0 + 4 < SRC_HEIGHT)

486

{

487

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

488

}

489

if(y * (uint)K0 + 5 < SRC_HEIGHT)

490

{

491

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

492

}

493

if(y * (uint)K0 + 6 < SRC_HEIGHT)

494

{

495

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

496

}

497

if(y * (uint)K0 + 7 < SRC_HEIGHT)

498

{

499

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

500

}

501

#endif // K0 > 4

502

#if K0 > 8

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

503

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

504

{

505

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

506

}

507

if(y * (uint)K0 + 9 < SRC_HEIGHT)

508

{

509

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

510

}

511

if(y * (uint)K0 + 10 < SRC_HEIGHT)

512

{

513

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

514

}

515

if(y * (uint)K0 + 11 < SRC_HEIGHT)

516

{

517

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

518

}

519

if(y * (uint)K0 + 12 < SRC_HEIGHT)

520

{

521

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

522

}

523

if(y * (uint)K0 + 13 < SRC_HEIGHT)

524

{

525

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

526

}

527

if(y * (uint)K0 + 14 < SRC_HEIGHT)

528

{

529

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

530

}

531

if(y * (uint)K0 + 15 < SRC_HEIGHT)

532

{

533

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Store output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

538

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

539

STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

540

541

#undef BLOCK_SIZE

542

#undef OUTPUT_OFFSET_X

#undef OUTPUT_STEP_X

}

#if defined(TRANSPOSE)

547

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in

548

* the output matrix unrolling the values.

549

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

550

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

551

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

552

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

553

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

554

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.

555

* @note The option -DTRANSPOSE must passed at compile time.

556

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

557

* N0: 2,3,4,8,16

558

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

559

* H0: greater than 0

560

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

561

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

562

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

563

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

564

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

565

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

566

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

567

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

568

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

569

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

570

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

571

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

572

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

573

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

574

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

575

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

576

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

577

*/

578

__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),

579

TENSOR3D_DECLARATION(dst))

580

{

581

// Block size

582

#define BLOCK_SIZE ((K0) * (N0))

583

584

// Output offset X

585

#if defined(INTERLEAVE)

586

#define OUTPUT_OFFSET_X (K0)

587

#else // defined(INTERLEAVE)

588

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

589

#endif // defined(INTERLEAVE)

590

591

// Output step X

592

#if defined(INTERLEAVE)

593

#define OUTPUT_STEP_X (K0) * (H0)

594

#else // Do not interleave

595

#define OUTPUT_STEP_X (K0)

596

#endif // defined(INTERLEAVE)

597

598

// Compute source and destination addresses

599

uint x = get_global_id(0);

600

uint y = get_global_id(1);

601

uint z = get_global_id(2);

602

603

// ------------------ Compute input/output addresses ---------------------------

604

605

// Compute the input address

606

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

607

608

// Compute the output address

609

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /

610

(uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

611

612

// ---------------------------Load input values --------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

613

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

614

615

// Load values from the RHS matrix

616

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

617

if(y * (uint)K0 + 1 < SRC_HEIGHT)

618

{

619

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

620

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

621

#if K0 > 2

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

622

if(y * (uint)K0 + 2 < SRC_HEIGHT)

623

{

624

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

625

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

626

#endif // K0 > 2

627

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

628

if(y * (uint)K0 + 3 < SRC_HEIGHT)

629

{

630

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

631

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

632

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

633

#if K0 > 4

634

if(y * (uint)K0 + 4 < SRC_HEIGHT)

635

{

636

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

637

}

638

if(y * (uint)K0 + 5 < SRC_HEIGHT)

639

{

640

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

641

}

642

if(y * (uint)K0 + 6 < SRC_HEIGHT)

643

{

644

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

645

}

646

if(y * (uint)K0 + 7 < SRC_HEIGHT)

647

{

648

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

649

}

650

#endif // K0 > 4

651

#if K0 > 8

Gian Marco Iodice

8912434

2018-12-19 14:17:22 +0000

[diff] [blame]

652

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

653

{

654

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

655

}

656

if(y * (uint)K0 + 9 < SRC_HEIGHT)

657

{

658

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

659

}

660

if(y * (uint)K0 + 10 < SRC_HEIGHT)

661

{

662

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

663

}

664

if(y * (uint)K0 + 11 < SRC_HEIGHT)

665

{

666

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

667

}

668

if(y * (uint)K0 + 12 < SRC_HEIGHT)

669

{

670

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

671

}

672

if(y * (uint)K0 + 13 < SRC_HEIGHT)

673

{

674

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

675

}

676

if(y * (uint)K0 + 14 < SRC_HEIGHT)

677

{

678

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

679

}

680

if(y * (uint)K0 + 15 < SRC_HEIGHT)

681

{

682

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Transpose the block ------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

687

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

688

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

689

#if K0 == 2

690

// This part computes the following transpositions:

// 2x2 -> 2x2

// 2x4 -> 4x2

// 2x8 -> 8x2

// 2x16 -> 16x2

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);

696

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);

697

#if N0 > 2

698

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);

699

#endif // N0 > 2

700

#if N0 > 3

701

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);

702

#endif // N0 > 3

703

#if N0 > 4

704

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);

705

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);

706

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);

707

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);

708

#endif // N0 > 4

709

#if N0 > 8

710

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);

711

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);

712

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);

713

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);

714

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);

715

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);

716

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);

717

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);

718

#endif // N0 > 8

719

720

#elif K0 == 3 // K0 == 2

721

// This part computes the following transpositions:

// 3x2 -> 2x3

// 3x4 -> 4x3

// 3x8 -> 8x3

// 3x16 -> 16x3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

726

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);

727

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

728

#if N0 > 2

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

729

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

730

#endif // N0 > 2

731

#if N0 > 3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

732

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

733

#endif // N0 > 3

734

#if N0 > 4

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

735

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);

736

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);

737

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);

738

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

739

#endif // N0 > 4

740

#if N0 > 8

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

741

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);

742

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);

743

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);

744

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);

745

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);

746

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);

747

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);

748

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

749

#endif // N0 > 8

750

751

#elif K0 == 4 // K0 == 4

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

752

// This part computes the following transpositions:

// 4x2 -> 2x4

// 4x4 -> 4x4

// 4x8 -> 8x4

// 4x16 -> 16x4

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);

758

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);

759

#if N0 > 2

760

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

761

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

762

#if N0 > 3

763

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);

764

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

765

#if N0 > 4

766

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);

767

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);

768

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);

769

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);

770

#endif // N0 > 4

771

#if N0 > 8

772

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);

773

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);

774

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);

775

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);

776

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);

777

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);

778

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);

779

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);

780

#endif // N0 > 8

781

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

782

#elif K0 == 8 // K0 == 8

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

783

// This part computes the following transpositions:

// 8x2 -> 2x8

// 8x4 -> 4x8

// 8x8 -> 8x8

// 8x16 -> 16x8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

788

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);

789

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

790

#if N0 > 2

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

791

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

792

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

793

#if N0 > 3

794

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);

795

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

796

#if N0 > 4

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

797

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);

798

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);

799

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);

800

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

801

#endif // N0 > 4

802

#if N0 > 8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

803

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);

804

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);

805

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);

806

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);

807

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);

808

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);

809

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);

810

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

811

#endif // N0 > 8

812

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

813

#elif K0 == 16 // K0 == 16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

814

815

// This part computes the following transpositions:

// 16x2 -> 2x16

// 16x4 -> 4x16

// 16x8 -> 8x16

// 16x16 -> 16x16

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,

821

a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);

822

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,

823

a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);

824

#if N0 > 2

825

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,

826

a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

827

#endif // N0 > 2

828

#if N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

829

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,

830

a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

831

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

832

#if N0 > 4

833

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,

834

a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);

835

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,

836

a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);

837

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,

838

a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);

839

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,

840

a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);

841

#endif // N0 > 4

842

#if N0 > 8

843

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,

844

a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);

845

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,

846

a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);

847

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,

848

a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);

849

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,

850

a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);

851

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,

852

a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);

853

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,

854

a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);

855

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,

856

a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);

857

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,

858

a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);

#endif // N0 > 8

#else // N0 == 16

#error "Not supported N0 value"

863

#endif // N0 > 2

864

865

// ---------------------------Store the output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

866

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

867

STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

868

869

#undef BLOCK_SIZE

870

#undef OUTPUT_OFFSET_X

871

#undef OUTPUT_STEP_X

872

}

873

#endif // defined(TRANSPOSE)

874

#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

875

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

876

#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

877

878

// Pastes the two tokens a and b into a single token (used below as CONCAT(ARM_DOT, k0) to pick the ARM_DOT<k0> macro).
#define CONCAT(a, b) a##b

879

880

/* Dot-product accumulation helpers: ARM_DOT<n>(a, b, c) performs c = fma(a.s<i>, b.s<i>, c) for each of the
 * n components of a and b (ARM_DOT1 takes scalars). The wider variants are built from the narrower ones:
 * 3 = 2 + one component, 4 = 3 + one component, 8 = two 4-wide halves, 16 = two 8-wide halves.
 * a and b are parenthesized before any member access so that non-primary expressions (e.g. *ptr) are accepted;
 * c must be an assignable lvalue and is evaluated several times.
 */
#define ARM_DOT1(a, b, c) \
    ({ \
        c = fma(a, b, c); \
    })
#define ARM_DOT2(a, b, c) \
    ({ \
        c = fma((a).s0, (b).s0, c); \
        c = fma((a).s1, (b).s1, c); \
    })
#define ARM_DOT3(a, b, c) \
    ({ \
        ARM_DOT2(a, b, c); \
        c = fma((a).s2, (b).s2, c); \
    })
#define ARM_DOT4(a, b, c) \
    ({ \
        ARM_DOT3(a, b, c); \
        c = fma((a).s3, (b).s3, c); \
    })
#define ARM_DOT8(a, b, c) \
    ({ \
        ARM_DOT4((a).lo, (b).lo, c); \
        ARM_DOT4((a).hi, (b).hi, c); \
    })
#define ARM_DOT16(a, b, c) \
    ({ \
        ARM_DOT8((a).lo, (b).lo, c); \
        ARM_DOT8((a).hi, (b).hi, c); \
    })

/* ARM_DOT_K0XN0(k0, a, b, c): for every column i in [0, N0), accumulates into component c.s<i> the k0-wide
 * dot product of a with the vector named b<i> (the column index is pasted onto the identifier passed as b).
 * The dot product itself is delegated to ARM_DOT<k0>, selected through CONCAT.
 */
#if N0 == 2
#define ARM_DOT_K0XN0(k0, a, b, c) \
    ({ \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
    })
#elif N0 == 3
#define ARM_DOT_K0XN0(k0, a, b, c) \
    ({ \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
    })
#elif N0 == 4
#define ARM_DOT_K0XN0(k0, a, b, c) \
    ({ \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
    })
#elif N0 == 8
#define ARM_DOT_K0XN0(k0, a, b, c) \
    ({ \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4)); \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5)); \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6)); \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7)); \
    })
#elif N0 == 16
#define ARM_DOT_K0XN0(k0, a, b, c) \
    ({ \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4)); \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5)); \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6)); \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7)); \
        CONCAT(ARM_DOT, k0)((a), (b##8), (c.s8)); \
        CONCAT(ARM_DOT, k0)((a), (b##9), (c.s9)); \
        CONCAT(ARM_DOT, k0)((a), (b##A), (c.sA)); \
        CONCAT(ARM_DOT, k0)((a), (b##B), (c.sB)); \
        CONCAT(ARM_DOT, k0)((a), (b##C), (c.sC)); \
        CONCAT(ARM_DOT, k0)((a), (b##D), (c.sD)); \
        CONCAT(ARM_DOT, k0)((a), (b##E), (c.sE)); \
        CONCAT(ARM_DOT, k0)((a), (b##F), (c.sF)); \
    })
#else // N0 is none of 2, 3, 4, 8, 16
#error "N0 value not supported"
#endif // N0 == 2 / 3 / 4 / 8 / 16

999

1000

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1001

* The LHS matrix is NOT reshaped

1002

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1003

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1004

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1005

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1006

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

1007

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1008

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1009

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1010

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1011

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1012

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1013

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1014

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1015

* - N0 = 2, 3, 4, 8, 16

1016

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1017

* - H0 >= 1

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1018

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1019

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1020

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1021

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1022

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1023

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1024

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1025

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1026

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1027

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1028

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1029

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1030

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1031

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1032

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1033

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1034

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1035

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1036

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1037

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1038

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1039

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1040

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1041

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1042

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1043

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1044

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1045

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1046

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

1047

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1048

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1049

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1050

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1051

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1052

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1053

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1054

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1055

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1056

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1057

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1058

*/

1059

__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),

1060

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1061

#if defined(BETA)

1062

IMAGE_DECLARATION(bias),

1063

#endif // defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1064

IMAGE_DECLARATION(dst),

1065

uint lhs_stride_z,

1066

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1067

#if defined(BETA)

1068

uint bias_stride_z,

1069

#endif //defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1070

uint dst_stride_z

1071

#if defined(REINTERPRET_INPUT_AS_3D)

1072

,

1073

uint lhs_cross_plane_pad

1074

#endif // REINTERPRET_INPUT_AS_3D

1075

#if defined(REINTERPRET_OUTPUT_AS_3D)

1076

,

1077

uint dst_cross_plane_pad

1078

#endif // REINTERPRET_OUTPUT_AS_3D

1079

)

1080

{

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1081

// Block size

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1082

#define RHS_BLOCK_SIZE ((K0) * (N0))

1083

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1084

// RHS offset and step X

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1085

#if defined(RHS_INTERLEAVE)

1086

#define RHS_OFFSET_X (K0)

1087

#define RHS_STEP_X ((K0) * (H0))

1088

#define RHS_STEP_LOOP (1)

1089

#else // defined(RHS_INTERLEAVE)

1090

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1091

#define RHS_STEP_X (K0)

1092

#define RHS_STEP_LOOP (H0)

1093

#endif // defined(RHS_INTERLEAVE)

1094

1095

uint x = get_global_id(0);

1096

uint y = get_global_id(1);

1097

uint z = get_global_id(2);

1098

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1099

#if defined(DUMMY_WORK_ITEMS)

1100

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1105

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1106

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1107

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1108

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1109

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1110

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1111

1112

#if defined(MATRIX_B_DEPTH)

1113

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1114

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1115

#else // defined(MATRIX_B_DEPTH)

1116

rhs_offset += z * rhs_stride_z;

1117

#endif // defined(MATRIX_B_DEPTH)

1118

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1119

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1120

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1121

1122

#if defined(REINTERPRET_INPUT_AS_3D)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1123

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1124

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1125

1126

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1127

// multiply lhs_stride_z by DEPTH_GEMM3D

1128

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1129

1130

#else // defined(REINTERPRET_INPUT_AS_3D)

1131

1132

// Add offset for batched GEMM

1133

lhs_offset += z * lhs_stride_z;

1134

1135

#endif // defined(REINTERPRET_INPUT_AS_3D)

1136

1137

// Initialize the accumulators

1138

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1139

1140

int i = 0;

1141

for(; i <= (K - K0); i += K0)

1142

{

1143

// Supported cases (M0, K0):

1144

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1145

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1146

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1147

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1148

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1149

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1150

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1151

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1152

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1153

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1154

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1155

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1156

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1157

1158

// Accumulate

1159

ARM_DOT_K0XN0(K0, a0, b, c0);

1160

#if M0 > 1

1161

ARM_DOT_K0XN0(K0, a1, b, c1);

1162

#endif // M0 > 1

1163

#if M0 > 2

1164

ARM_DOT_K0XN0(K0, a2, b, c2);

1165

#endif // M0 > 2

1166

#if M0 > 3

1167

ARM_DOT_K0XN0(K0, a3, b, c3);

1168

#endif // M0 > 3

1169

#if M0 > 4

1170

ARM_DOT_K0XN0(K0, a4, b, c4);

1171

#endif // M0 > 4

1172

#if M0 > 5

1173

ARM_DOT_K0XN0(K0, a5, b, c5);

1174

#endif // M0 > 5

1175

#if M0 > 6

1176

ARM_DOT_K0XN0(K0, a6, b, c6);

1177

#endif // M0 > 6

1178

#if M0 > 7

1179

ARM_DOT_K0XN0(K0, a7, b, c7);

1180

#endif // M0 > 7

1181

1182

lhs_offset += K0 * sizeof(DATA_TYPE);

1183

rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

1184

}

1185

1186

// Left-over accumulations

1187

for(; i < K; ++i)

1188

{

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1189

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1190

LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1191

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1192

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1193

LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1194

1195

// Accumulate

1196

ARM_DOT_K0XN0(1, a0, b, c0);

1197

#if M0 > 1

1198

ARM_DOT_K0XN0(1, a1, b, c1);

1199

#endif // M0 > 1

1200

#if M0 > 2

1201

ARM_DOT_K0XN0(1, a2, b, c2);

1202

#endif // M0 > 2

1203

#if M0 > 3

1204

ARM_DOT_K0XN0(1, a3, b, c3);

1205

#endif // M0 > 3

1206

#if M0 > 4

1207

ARM_DOT_K0XN0(1, a4, b, c4);

1208

#endif // M0 > 4

1209

#if M0 > 5

1210

ARM_DOT_K0XN0(1, a5, b, c5);

1211

#endif // M0 > 5

1212

#if M0 > 6

1213

ARM_DOT_K0XN0(1, a6, b, c6);

1214

#endif // M0 > 6

1215

#if M0 > 7

1216

ARM_DOT_K0XN0(1, a7, b, c7);

1217

#endif // M0 > 7

1218

1219

lhs_offset += sizeof(DATA_TYPE);

1220

rhs_offset += sizeof(DATA_TYPE);

1221

}

1222

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1223

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1224

1225

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1226

1227

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1228

1229

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1230

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1231

1232

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1233

// multiply dst_stride_z by DEPTH_GEMM3D

1234

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1235

1236

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1237

1238

// Add offset for batched GEMM

1239

dst_addr += z * dst_stride_z;

1240

1241

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1242

1243

// Multiply by the weight of matrix-matrix product and store the result

1244

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1245

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1246

#endif // defined(ALPHA)

1247

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1248

// Add beta*bias

1249

#if defined(BETA)

1250

#if defined(BROADCAST_BIAS)

1251

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1252

1253

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1254

1255

#ifndef UNIT_BETA

1256

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1257

#endif // UNIT_BIAS

1258

1259

// c = c + bias[broadcasted]

1260

ADD_BLOCK_BROADCAST(M0, c, bias0);

1261

1262

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1263

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1264

1265

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1266

1267

#ifndef UNIT_BETA

1268

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

1273

1274

#endif // defined(BROADCAST_BIAS)

1275

#endif // defined(BETA)

1276

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1277

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

1278

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1279

#endif // defined(ACTIVATION_TYPE)

1280

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

1281

const bool cond_y = y == 0;

1282

const bool cond_x = ((x + 1) * N0 >= N);

1283

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1284

// Store output block

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame^]

1285

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1286

1287

#undef RHS_BLOCK_SIZE

1288

#undef RHS_OFFSET_X

1289

#undef RHS_STEP_X

1290

}

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1291

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1292

#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If -DALPHA is defined, the accumulated block is scaled by ALPHA before the bias is added.
 * @note If -DBETA is defined, a bias block is loaded and added to the result. It is scaled by BETA unless -DUNIT_BETA is defined,
 *       and a single bias row is broadcast over the M0 rows when -DBROADCAST_BIAS is defined.
 * @note If -DMATRIX_B_DEPTH is defined, the RHS batch index is computed as get_global_id(2) % MATRIX_B_DEPTH.
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                  __read_only image2d_t rhs_img,
#if defined(BETA)
                                                  IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                  IMAGE_DECLARATION(dst),
                                                  uint lhs_stride_z,
                                                  uint rhs_stride_z,
#if defined(BETA)
                                                  uint bias_stride_z,
#endif //defined(BETA)
                                                  uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                  ,
                                                  uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
                                                  uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // Number of K elements that do not fill a whole K0 block
#define LEFTOVER_K (K % K0)

    // Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the M x N output have nothing to compute
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    // Main loop: consume K in whole blocks of K0 elements
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate
        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif // M0 > 7

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

#if LEFTOVER_K != 0
    // Note: We cannot read out-of-bound elements from the RHS matrix because
    // the RHS width is always multiple of K0. This is not true for the LHS matrix
    //
    // The LHS rows are therefore staged element by element in a zero-initialised union:
    // only the first LEFTOVER_K lanes are written, the remaining lanes stay 0.
    union UNION_VEC_TYPE
    {
        DATA_TYPE s[K0];
        VEC_DATA_TYPE(DATA_TYPE, K0)
        v;
    };

    union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
    union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
    union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
    union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
    union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
    union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
    union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
    union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7

    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

    // Load from RHS matrix
    LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

    // Load from LHS matrix, one scalar per row and per leftover K index
    for(int k = 0; k < LEFTOVER_K; ++k)
    {
        a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
        a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
        a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
        a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
        a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
        a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
        a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
        a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7

        lhs_offset += sizeof(DATA_TYPE);
    }

    // Accumulate
    ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
    ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
    ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
    ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
    ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
    ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
    ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
    ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7

#endif // LEFTOVER_K != 0

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout(M0-1)=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Boundary flags handed to the store macro: first block of rows / block reaching the last column
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

    // Release every helper macro defined at the top of this kernel
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef LEFTOVER_K
#undef PIXEL_UNIT
}
#endif // defined(OPENCL_IMAGE_SUPPORT)

1640

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1641

/** Fused multiply-add with in-place accumulation: c = fma(a, b, c).
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator. Must be an assignable expression; it is read as the addend and overwritten with the result
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/** Accumulate one K step for all M0 rows: for every row r in [0, M0), c##r = fma(a##r.s##i, b, c##r).
 *
 * The scalar component s##i of each LHS row variable is converted to VEC_DATA_TYPE(DATA_TYPE, N0)
 * before being passed to VFMA together with the RHS vector @p b.
 *
 * @param[in]      i Component suffix selecting the element of each LHS row (pasted after ".s")
 * @param[in]      a Basename of the LHS row variables (a##0 ... a##(M0-1))
 * @param[in]      b RHS vector multiplied with every row
 * @param[in, out] c Basename of the accumulator variables (c##0 ... c##(M0-1))
 */
#if M0 == 1
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

1717

1718

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1719

* The LHS matrix is NOT reshaped

1720

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

1721

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1722

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1723

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

1724

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1725

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1726

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1727

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1728

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1729

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1730

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1731

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1732

* - N0 = 2, 3, 4, 8, 16

1733

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1734

* - H0 >= 1

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1735

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1736

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1737

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1738

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1739

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1740

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1741

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1742

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1743

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1744

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1745

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1746

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1747

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1748

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1749

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1750

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1751

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1752

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1753

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1754

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1755

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1756

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1757

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1758

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1759

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1760

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1761

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1762

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1763

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1764

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1765

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1766

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1767

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1768

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1769

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1770

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1771

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1772

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1773

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1774

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1775

*/

1776

__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),

1777

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1778

#if defined(BETA)

1779

IMAGE_DECLARATION(bias),

1780

#endif // defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1781

IMAGE_DECLARATION(dst),

1782

uint lhs_stride_z,

1783

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1784

#if defined(BETA)

1785

uint bias_stride_z,

1786

#endif //defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1787

uint dst_stride_z

1788

#if defined(REINTERPRET_INPUT_AS_3D)

1789

,

1790

uint lhs_cross_plane_pad

1791

#endif // REINTERPRET_INPUT_AS_3D

1792

#if defined(REINTERPRET_OUTPUT_AS_3D)

1793

,

1794

uint dst_cross_plane_pad

1795

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

1800

1801

// RHS offset and step X

1802

#if defined(RHS_INTERLEAVE)

1803

#define RHS_OFFSET_X (N0)

1804

#define RHS_STEP_X ((N0) * (H0))

1805

#define RHS_STEP_LOOP (1)

1806

#else // defined(RHS_INTERLEAVE)

1807

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1808

#define RHS_STEP_X (N0)

1809

#define RHS_STEP_LOOP (H0)

1810

#endif // defined(RHS_INTERLEAVE)

1811

1812

uint x = get_global_id(0);

1813

uint y = get_global_id(1);

1814

uint z = get_global_id(2);

1815

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1816

#if defined(DUMMY_WORK_ITEMS)

1817

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1822

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1823

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1824

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1825

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1826

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1827

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1828

1829

#if defined(MATRIX_B_DEPTH)

1830

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1831

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1832

#else // defined(MATRIX_B_DEPTH)

1833

rhs_offset += z * rhs_stride_z;

1834

#endif // defined(MATRIX_B_DEPTH)

1835

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1836

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;

1837

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... (16 variables in total)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1838

1839

#if defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1840

1841

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1842

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1843

1844

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1845

// multiply lhs_stride_z by DEPTH_GEMM3D

1846

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1847

1848

#else // defined(REINTERPRET_INPUT_AS_3D)

1849

1850

// Add offset for batched GEMM

1851

lhs_offset += z * lhs_stride_z;

1852

1853

#endif // defined(REINTERPRET_INPUT_AS_3D)

1854

1855

// Initialize the accumulators

1856

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1857

1858

int i = 0;

1859

for(; i <= (K - K0); i += K0)

1860

{

1861

// Supported cases (M0, K0):

1862

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1863

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1864

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1865

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1866

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1867

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1868

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1869

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1870

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1871

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1872

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1873

VEC_DATA_TYPE(DATA_TYPE, N0)

1874

b0;

1875

1876

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1877

VFMA_M0xN0(0, a, b0, c);

1878

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));

1879

VFMA_M0xN0(1, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1880

#if K0 > 2

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1881

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));

1882

VFMA_M0xN0(2, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1883

#endif // K0 > 2

1884

#if K0 > 3

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1885

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));

1886

VFMA_M0xN0(3, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1887

#endif // K0 > 3

1888

#if K0 > 4

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1889

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));

1890

VFMA_M0xN0(4, a, b0, c);

1891

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));

1892

VFMA_M0xN0(5, a, b0, c);

1893

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));

1894

VFMA_M0xN0(6, a, b0, c);

1895

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));

1896

VFMA_M0xN0(7, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1897

#endif // K0 > 4

1898

#if K0 > 8

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1899

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));

1900

VFMA_M0xN0(8, a, b0, c);

1901

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));

1902

VFMA_M0xN0(9, a, b0, c);

1903

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));

1904

VFMA_M0xN0(A, a, b0, c);

1905

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));

1906

VFMA_M0xN0(B, a, b0, c);

1907

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));

1908

VFMA_M0xN0(C, a, b0, c);

1909

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));

1910

VFMA_M0xN0(D, a, b0, c);

1911

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));

1912

VFMA_M0xN0(E, a, b0, c);

1913

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));

1914

VFMA_M0xN0(F, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1915

#endif // K0 > 8

1916

1917

lhs_offset += K0 * sizeof(DATA_TYPE);

1918

rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);

1919

}

1920

1921

// Left-over accumulations

1922

for(; i < K; ++i)

1923

{

1924

// Load values from LHS matrix

1925

VEC_DATA_TYPE(DATA_TYPE, 2)

1926

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

1927

#if M0 > 1

1928

VEC_DATA_TYPE(DATA_TYPE, 2)

1929

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

1930

#endif // M0 > 1

1931

#if M0 > 2

1932

VEC_DATA_TYPE(DATA_TYPE, 2)

1933

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

1934

#endif // M0 > 2

1935

#if M0 > 3

1936

VEC_DATA_TYPE(DATA_TYPE, 2)

1937

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

1938

#endif // M0 > 3

1939

#if M0 > 4

1940

VEC_DATA_TYPE(DATA_TYPE, 2)

1941

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

1942

#endif // M0 > 4

1943

#if M0 > 5

1944

VEC_DATA_TYPE(DATA_TYPE, 2)

1945

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

1946

#endif // M0 > 5

1947

#if M0 > 6

1948

VEC_DATA_TYPE(DATA_TYPE, 2)

1949

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

1950

#endif // M0 > 6

1951

#if M0 > 7

1952

VEC_DATA_TYPE(DATA_TYPE, 2)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

1953

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1954

#endif // M0 > 7

1955

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1956

VEC_DATA_TYPE(DATA_TYPE, N0)

1957

b0;

1958

1959

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1960

VFMA_M0xN0(0, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1961

1962

lhs_offset += sizeof(DATA_TYPE);

1963

rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);

1964

}

1965

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1966

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1967

1968

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1969

1970

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1971

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1972

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1973

1974

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1975

// multiply dst_stride_z by DEPTH_GEMM3D

1976

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1977

1978

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1979

1980

// Add offset for batched GEMM

1981

dst_addr += z * dst_stride_z;

1982

1983

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1984

1985

// Multiply by the weight of matrix-matrix product and store the result

1986

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1987

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1988

#endif // defined(ALPHA)

1989

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1990

// Add beta*bias

1991

#if defined(BETA)

1992

#if defined(BROADCAST_BIAS)

1993

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1994

1995

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1996

1997

#ifndef UNIT_BETA

1998

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1999

#endif // UNIT_BETA

2000

2001

// c = c + bias[broadcasted]

2002

ADD_BLOCK_BROADCAST(M0, c, bias0);

2003

2004

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2005

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

2006

2007

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2008

2009

#ifndef UNIT_BETA

2010

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

2015

2016

#endif // defined(BROADCAST_BIAS)

2017

#endif // defined(BETA)

2018

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2019

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

2020

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2021

#endif // defined(ACTIVATION_TYPE)

2022

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2023

const bool cond_y = y == 0;

2024

const bool cond_x = ((x + 1) * N0 >= N);

2025

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2026

// Store output block

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame^]

2027

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2028

2029

#undef RHS_BLOCK_SIZE

2030

#undef RHS_OFFSET_X

2031

#undef RHS_STEP_X

2032

}

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2033

2034

#if defined(OPENCL_IMAGE_SUPPORT)

2035

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2036

* The LHS matrix is NOT reshaped

2037

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

2038

*

2039

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2040

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2041

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

2042

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2043

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2044

* could be different from the value returned by get_image_height(rhs_img).

2045

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

2046

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

2047

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2048

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

2049

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

2050

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2051

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2052

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

2058

* The activation function is performed after the bias addition

2059

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2060

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

2061

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2062

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2063

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2064

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

2065

*

2066

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32

2067

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

2068

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2069

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

2070

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2071

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

2072

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2073

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2074

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2075

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2076

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2077

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2078

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2079

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

2080

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2081

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2082

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2083

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2084

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2085

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

2086

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2087

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2088

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2089

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

2090

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2091

*/

2092

__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),

2093

__read_only image2d_t rhs_img,

2094

#if defined(BETA)

2095

IMAGE_DECLARATION(bias),

2096

#endif // defined(BETA)

2097

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2103

uint dst_stride_z

2104

#if defined(REINTERPRET_INPUT_AS_3D)

2105

,

2106

uint lhs_cross_plane_pad

2107

#endif // REINTERPRET_INPUT_AS_3D

2108

#if defined(REINTERPRET_OUTPUT_AS_3D)

2109

,

2110

uint dst_cross_plane_pad

2111

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

2116

2117

// Block size

2118

#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

2119

2120

// RHS offset and step X

2121

// Layout macros for walking the reshaped RHS image, expressed in pixels:
//   RHS_OFFSET_X  - offset between the K0xN0 blocks selected by (x % H0)
//   RHS_STEP_X    - offset between two consecutive rows of the same K0xN0 block
//   RHS_STEP_LOOP - extra factor applied when the main loop advances by K0 rows
//                   (x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP)
//
// RHS_STEP_LOOP is defined here so that this kernel does not rely on the
// definition that gemm_mm_reshaped_only_rhs_nt leaves behind (that kernel does
// not #undef it). The values are the same as in that kernel, so when both
// kernels are compiled together the redefinition is token-identical.
// NOTE(review): RHS_STEP_LOOP is intentionally not #undef'd at the end of this
// kernel, to keep the macro state seen by later code unchanged - confirm before
// adding an #undef.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

2128

2129

uint x = get_global_id(0);

2130

uint y = get_global_id(1);

2131

uint z = get_global_id(2);

2132

2133

#if defined(DUMMY_WORK_ITEMS)

2134

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2139

2140

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2141

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2142

2143

#if defined(MATRIX_B_DEPTH)

2144

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2145

const uint z_rhs = (z % MATRIX_B_DEPTH);

2146

#else // defined(MATRIX_B_DEPTH)

2147

const uint z_rhs = z;

2148

#endif // defined(MATRIX_B_DEPTH)

2149

2150

// Compute RHS matrix coordinates

2151

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

2152

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

2153

2154

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);

2155

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2156

2157

#if defined(REINTERPRET_INPUT_AS_3D)

2158

2159

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2160

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

2161

2162

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2163

// multiply lhs_stride_z by DEPTH_GEMM3D

2164

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

2165

2166

#else // defined(REINTERPRET_INPUT_AS_3D)

2167

2168

// Add offset for batched GEMM

2169

lhs_offset += z * lhs_stride_z;

2170

2171

#endif // defined(REINTERPRET_INPUT_AS_3D)

2172

2173

// Initialize the accumulators

2174

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

2175

2176

int i = 0;

2177

for(; i <= (K - K0); i += K0)

2178

{

2179

// Load values from LHS matrix

2180

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

2181

2182

VEC_DATA_TYPE(DATA_TYPE, N0)

2183

b0;

2184

2185

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2186

VFMA_M0xN0(0, a, b0, c);

2187

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

2188

VFMA_M0xN0(1, a, b0, c);

2189

#if K0 > 2

2190

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

2191

VFMA_M0xN0(2, a, b0, c);

2192

#endif // K0 > 2

2193

#if K0 > 3

2194

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

2195

VFMA_M0xN0(3, a, b0, c);

2196

#endif // K0 > 3

2197

#if K0 > 4

2198

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

2199

VFMA_M0xN0(4, a, b0, c);

2200

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

2201

VFMA_M0xN0(5, a, b0, c);

2202

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

2203

VFMA_M0xN0(6, a, b0, c);

2204

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

2205

VFMA_M0xN0(7, a, b0, c);

2206

#endif // K0 > 4

2207

#if K0 > 8

2208

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

2209

VFMA_M0xN0(8, a, b0, c);

2210

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

2211

VFMA_M0xN0(9, a, b0, c);

2212

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

2213

VFMA_M0xN0(A, a, b0, c);

2214

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

2215

VFMA_M0xN0(B, a, b0, c);

2216

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

2217

VFMA_M0xN0(C, a, b0, c);

2218

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

2219

VFMA_M0xN0(D, a, b0, c);

2220

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

2221

VFMA_M0xN0(E, a, b0, c);

2222

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

2223

VFMA_M0xN0(F, a, b0, c);

2224

#endif // K0 > 8

2225

2226

lhs_offset += K0 * sizeof(DATA_TYPE);

2227

x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;

2228

}

2229

2230

// Left-over accumulations

2231

for(; i < K; ++i)

2232

{

2233

// Load values from LHS matrix

2234

VEC_DATA_TYPE(DATA_TYPE, 2)

2235

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

2236

#if M0 > 1

2237

VEC_DATA_TYPE(DATA_TYPE, 2)

2238

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

2239

#endif // M0 > 1

2240

#if M0 > 2

2241

VEC_DATA_TYPE(DATA_TYPE, 2)

2242

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

2243

#endif // M0 > 2

2244

#if M0 > 3

2245

VEC_DATA_TYPE(DATA_TYPE, 2)

2246

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

2247

#endif // M0 > 3

2248

#if M0 > 4

2249

VEC_DATA_TYPE(DATA_TYPE, 2)

2250

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

2251

#endif // M0 > 4

2252

#if M0 > 5

2253

VEC_DATA_TYPE(DATA_TYPE, 2)

2254

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

2255

#endif // M0 > 5

2256

#if M0 > 6

2257

VEC_DATA_TYPE(DATA_TYPE, 2)

2258

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

2259

#endif // M0 > 6

2260

#if M0 > 7

2261

VEC_DATA_TYPE(DATA_TYPE, 2)

2262

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

2263

#endif // M0 > 7

2264

2265

VEC_DATA_TYPE(DATA_TYPE, N0)

2266

b0;

2267

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2268

2269

VFMA_M0xN0(0, a, b0, c);

2270

2271

lhs_offset += sizeof(DATA_TYPE);

x_rhs += RHS_STEP_X;

}

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2275

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2276

2277

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

2278

2279

#if defined(REINTERPRET_OUTPUT_AS_3D)

2280

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2281

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2282

2283

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2284

// multiply dst_stride_z by DEPTH_GEMM3D

2285

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2286

2287

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2288

2289

// Add offset for batched GEMM

2290

dst_addr += z * dst_stride_z;

2291

2292

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2293

2294

// Multiply by the weight of matrix-matrix product and store the result

2295

#if defined(ALPHA)

2296

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2297

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2302

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2303

2304

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2305

2306

#ifndef UNIT_BETA

2307

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2308

#endif // UNIT_BETA

2309

2310

// c = c + bias[broadcasted]

2311

ADD_BLOCK_BROADCAST(M0, c, bias0);

2312

2313

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2314

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2315

2316

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2317

2318

#ifndef UNIT_BETA

2319

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

2324

2325

#endif // defined(BROADCAST_BIAS)

2326

#endif // defined(BETA)

2327

2328

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

2329

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2330

#endif // defined(ACTIVATION_TYPE)

2331

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2332

const bool cond_y = y == 0;

2333

const bool cond_x = ((x + 1) * N0 >= N);

2334

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2335

// Store output block

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame^]

2336

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2337

2338

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2343

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2344

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2345

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2346

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2347

/* ARM_DOT_K0_LANE(a, b, c, i): one multiply-accumulate step on lane `i`.
 *
 * Multiplies component .s<i> of `a` by component .s<i> of `b` and adds the product to `c`.
 * - With -DMIXED_PRECISION the product is added with a plain `+=`.
 * - Otherwise the update is done through fma().
 * The macro deliberately carries no trailing semicolon: the caller supplies it.
 */
#if defined(MIXED_PRECISION)
#define ARM_DOT_K0_LANE(a, b, c, i) c += a.s##i * b.s##i
#else // defined(MIXED_PRECISION)
#define ARM_DOT_K0_LANE(a, b, c, i) c = fma(a.s##i, b.s##i, c)
#endif // defined(MIXED_PRECISION)

/* ARM_DOT_K0(a, b, c): accumulates into `c` the dot product of the first K0 lanes of `a` and `b`.
 *
 * The lanes are processed in ascending order (s0, s1, ...), one ARM_DOT_K0_LANE step each.
 * Only K0 = 2, 3, 4, 8 and 16 are handled; any other value stops the build.
 */
#if K0 == 2
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANE(a, b, c, 0); \
        ARM_DOT_K0_LANE(a, b, c, 1); \
    })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANE(a, b, c, 0); \
        ARM_DOT_K0_LANE(a, b, c, 1); \
        ARM_DOT_K0_LANE(a, b, c, 2); \
    })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANE(a, b, c, 0); \
        ARM_DOT_K0_LANE(a, b, c, 1); \
        ARM_DOT_K0_LANE(a, b, c, 2); \
        ARM_DOT_K0_LANE(a, b, c, 3); \
    })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANE(a, b, c, 0); \
        ARM_DOT_K0_LANE(a, b, c, 1); \
        ARM_DOT_K0_LANE(a, b, c, 2); \
        ARM_DOT_K0_LANE(a, b, c, 3); \
        ARM_DOT_K0_LANE(a, b, c, 4); \
        ARM_DOT_K0_LANE(a, b, c, 5); \
        ARM_DOT_K0_LANE(a, b, c, 6); \
        ARM_DOT_K0_LANE(a, b, c, 7); \
    })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_LANE(a, b, c, 0); \
        ARM_DOT_K0_LANE(a, b, c, 1); \
        ARM_DOT_K0_LANE(a, b, c, 2); \
        ARM_DOT_K0_LANE(a, b, c, 3); \
        ARM_DOT_K0_LANE(a, b, c, 4); \
        ARM_DOT_K0_LANE(a, b, c, 5); \
        ARM_DOT_K0_LANE(a, b, c, 6); \
        ARM_DOT_K0_LANE(a, b, c, 7); \
        ARM_DOT_K0_LANE(a, b, c, 8); \
        ARM_DOT_K0_LANE(a, b, c, 9); \
        ARM_DOT_K0_LANE(a, b, c, A); \
        ARM_DOT_K0_LANE(a, b, c, B); \
        ARM_DOT_K0_LANE(a, b, c, C); \
        ARM_DOT_K0_LANE(a, b, c, D); \
        ARM_DOT_K0_LANE(a, b, c, E); \
        ARM_DOT_K0_LANE(a, b, c, F); \
    })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2462

2463

/* ARM_DOT_K0XN0_COL(a, b, c, j): one output column of the K0xN0 product.
 *
 * Accumulates ARM_DOT_K0 of `a` with the variable named <b><j> into component .s<j> of `c`.
 * `b` is a variable-name prefix (b0, b1, ...), not a value.
 */
#define ARM_DOT_K0XN0_COL(a, b, c, j) ARM_DOT_K0((a), (b##j), (c.s##j))

/* ARM_DOT_K0XN0(a, b, c): for every column j in [0, N0), accumulates the dot product of `a`
 * with <b><j> into c.s<j>. Columns are visited in ascending order.
 * Only N0 = 2, 3, 4, 8 and 16 are handled; any other value stops the build.
 */
#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
        ARM_DOT_K0XN0_COL(a, b, c, 3); \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
        ARM_DOT_K0XN0_COL(a, b, c, 3); \
        ARM_DOT_K0XN0_COL(a, b, c, 4); \
        ARM_DOT_K0XN0_COL(a, b, c, 5); \
        ARM_DOT_K0XN0_COL(a, b, c, 6); \
        ARM_DOT_K0XN0_COL(a, b, c, 7); \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(a, b, c)         \
    ({                                 \
        ARM_DOT_K0XN0_COL(a, b, c, 0); \
        ARM_DOT_K0XN0_COL(a, b, c, 1); \
        ARM_DOT_K0XN0_COL(a, b, c, 2); \
        ARM_DOT_K0XN0_COL(a, b, c, 3); \
        ARM_DOT_K0XN0_COL(a, b, c, 4); \
        ARM_DOT_K0XN0_COL(a, b, c, 5); \
        ARM_DOT_K0XN0_COL(a, b, c, 6); \
        ARM_DOT_K0XN0_COL(a, b, c, 7); \
        ARM_DOT_K0XN0_COL(a, b, c, 8); \
        ARM_DOT_K0XN0_COL(a, b, c, 9); \
        ARM_DOT_K0XN0_COL(a, b, c, A); \
        ARM_DOT_K0XN0_COL(a, b, c, B); \
        ARM_DOT_K0XN0_COL(a, b, c, C); \
        ARM_DOT_K0XN0_COL(a, b, c, D); \
        ARM_DOT_K0XN0_COL(a, b, c, E); \
        ARM_DOT_K0XN0_COL(a, b, c, F); \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

2519

2520

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2521

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2522

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2523

*

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2524

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2525

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2526

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2527

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2528

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2529

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2530

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2531

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2532

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.

2533

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2534

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

2535

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2536

* @note Only the following configurations of M0, N0 and K0 are currently supported:

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2537

* - M0 = 2, 3, 4, 5, 6, 7, 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2538

* - N0 = 2, 3, 4, 8, 16

2539

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

2540

* - V0 >= 1

2541

* - H0 >= 1

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2542

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2543

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2544

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2545

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2546

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2547

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2548

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2549

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2550

*

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2551

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

2552

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2553

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2554

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2555

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2556

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2557

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

2558

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

2559

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2560

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

2561

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2562

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

2563

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2564

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2565

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2566

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2567

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2568

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2569

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2570

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2571

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2572

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2573

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2574

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2575

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2576

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2577

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2578

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2579

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2580

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2581

*/

2582

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

2583

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2584

#if defined(BETA)

2585

IMAGE_DECLARATION(bias),

2586

#endif // defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2587

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2588

uint k,

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2589

uint lhs_stride_z,

2590

uint rhs_stride_z,

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2591

#if defined(BETA)

2592

uint bias_stride_z,

2593

#endif //defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2594

uint dst_stride_z

2595

#if defined(REINTERPRET_OUTPUT_AS_3D)

2596

,

2597

uint dst_cross_plane_pad

2598

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

2603

2604

#if defined(LHS_INTERLEAVE)

2605

#define LHS_OFFSET_X (K0)

2606

#define LHS_STEP_X ((K0) * (V0))

2607

#define LHS_STEP_LOOP (1)

2608

#else // defined(INTERLEAVE)

2609

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2610

#define LHS_STEP_X (K0)

2611

#define LHS_STEP_LOOP (V0)

2612

#endif // defined(INTERLEAVE)

2613

2614

// Block size

2615

#define RHS_BLOCK_SIZE ((K0) * (N0))

2616

2617

// RHS offset and step X

2618

#if defined(RHS_INTERLEAVE)

2619

#define RHS_OFFSET_X (K0)

2620

#define RHS_STEP_X ((K0) * (H0))

2621

#define RHS_STEP_LOOP (1)

2622

#else // defined(RHS_INTERLEAVE)

2623

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2624

#define RHS_STEP_X (K0)

2625

#define RHS_STEP_LOOP (H0)

2626

#endif // defined(RHS_INTERLEAVE)

2627

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2628

#if defined(DUMMY_WORK_ITEMS)

2629

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2634

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2635

// Compute LHS matrix address

2636

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2637

(get_global_id(2) * lhs_stride_z);

2638

2639

// Compute RHS matrix address

2640

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

2641

2642

#if defined(MATRIX_B_DEPTH)

2643

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2644

rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;

2645

#else // defined(MATRIX_B_DEPTH)

2646

rhs_addr += get_global_id(2) * rhs_stride_z;

2647

#endif // defined(MATRIX_B_DEPTH)

2648

2649

// Initialize the accumulators

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2650

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2651

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2652

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2653

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2654

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2655

for(int i = 0; i < k; i += K0)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2656

{

2657

// Supported cases (M0, K0):

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2658

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

2659

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

2660

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

2661

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

2662

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

2663

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

2664

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

2665

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2666

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2667

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2668

2669

// Load values from RHS matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2670

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2671

2672

// Accumulate

2673

ARM_DOT_K0XN0(a0, b, c0);

2674

#if M0 > 1

2675

ARM_DOT_K0XN0(a1, b, c1);

2676

#endif // M0 > 1

2677

#if M0 > 2

2678

ARM_DOT_K0XN0(a2, b, c2);

2679

#endif // M0 > 2

2680

#if M0 > 3

2681

ARM_DOT_K0XN0(a3, b, c3);

2682

#endif // M0 > 3

2683

#if M0 > 4

2684

ARM_DOT_K0XN0(a4, b, c4);

2685

#endif // M0 > 4

2686

#if M0 > 5

2687

ARM_DOT_K0XN0(a5, b, c5);

2688

#endif // M0 > 5

2689

#if M0 > 6

2690

ARM_DOT_K0XN0(a6, b, c6);

2691

#endif // M0 > 6

2692

#if M0 > 7

2693

ARM_DOT_K0XN0(a7, b, c7);

2694

#endif // M0 > 7

2695

2696

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2697

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

2698

}

2699

2700

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2701

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2702

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2703

2704

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2705

2706

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2707

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2708

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2709

// multiply dst_stride_z by DEPTH_GEMM3D

2710

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2711

2712

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2713

2714

// Add offset for batched GEMM

2715

dst_addr += get_global_id(2) * dst_stride_z;

2716

2717

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2718

2719

// Multiply by the weight of matrix-matrix product and store the result

2720

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2721

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2722

#endif // defined(ALPHA)

2723

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2724

// Add beta*bias

2725

#if defined(BETA)

2726

#if defined(BROADCAST_BIAS)

2727

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2728

2729

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2730

2731

#ifndef UNIT_BETA

2732

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2733

#endif // UNIT_BIAS

2734

2735

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2736

#if defined(MIXED_PRECISION)

2737

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2738

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2739

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2740

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2741

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2742

2743

#else // defined(BROADCAST_BIAS)

2744

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2745

2) * bias_stride_z;

2746

2747

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2748

2749

#ifndef UNIT_BETA

2750

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

2751

#endif // UNIT_BIAS

2752

2753

// c = c + bias

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2754

#if defined(MIXED_PRECISION)

2755

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2756

ADD_BLOCK(M0, c, bias_hp);

2757

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2758

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2759

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2760

2761

#endif // defined(BROADCAST_BIAS)

2762

#endif // defined(BETA)

2763

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2764

#if defined(ACTIVATION_TYPE)

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2765

#if defined(MIXED_PRECISION)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

2766

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2767

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

2768

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2769

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2770

#endif // defined(ACTIVATION_TYPE)

2771

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2772

const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);

2773

const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

2774

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2775

// Store output block

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2776

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2777

CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame^]

2778

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2779

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame^]

2780

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2781

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2782

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2783

#undef LHS_BLOCK_SIZE

2784

#undef LHS_OFFSET_X

2785

#undef LHS_STEP_X

2786

#undef RHS_BLOCK_SIZE

2787

#undef RHS_OFFSET_X

2788

#undef RHS_STEP_X

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2789

#undef LHS_STEP_LOOP

2790

#undef RHS_STEP_LOOP

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2791

}

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2792

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2793

#if defined(OPENCL_IMAGE_SUPPORT)

2794

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

2795

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2796

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2797

*

2798

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2799

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2800

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2801

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

2802

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2803

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2804

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2805

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2806

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2807

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2808

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2809

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2810

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2811

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2812

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

2813

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2814

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2815

* - M0 = 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

2822

* The activation function is performed after the bias addition

2823

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

2824

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2825

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2826

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2827

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2828

*

2829

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

2830

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2831

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2832

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2833

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2834

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2835

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2836

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2837

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2838

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2839

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2840

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2841

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2842

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2843

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2844

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2845

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2846

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2847

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2848

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2849

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2850

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2851

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2852

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2853

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2854

*/

2855

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),

2856

__read_only image2d_t rhs_img,

2857

#if defined(BETA)

2858

IMAGE_DECLARATION(bias),

2859

#endif // defined(BETA)

2860

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2861

uint k,

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2867

uint dst_stride_z

2868

#if defined(REINTERPRET_OUTPUT_AS_3D)

2869

,

2870

uint dst_cross_plane_pad

2871

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

2876

2877

// Block size

2878

#define LHS_BLOCK_SIZE ((K0) * (M0))

2879

2880

#if defined(LHS_INTERLEAVE)

2881

#define LHS_OFFSET_X (K0)

2882

#define LHS_STEP_X ((K0) * (V0))

2883

#define LHS_STEP_LOOP (1)

2884

#else // defined(INTERLEAVE)

2885

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2886

#define LHS_STEP_X (K0)

2887

#define LHS_STEP_LOOP (V0)

2888

#endif // defined(INTERLEAVE)

2889

2890

// Block size

2891

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

2892

2893

// RHS offset and step X

2894

#if defined(RHS_INTERLEAVE)

2895

#define RHS_OFFSET_X (PIXEL_UNIT)

2896

#define RHS_STEP_X (PIXEL_UNIT * (H0))

2897

#define RHS_STEP_LOOP (1)

2898

#else // defined(RHS_INTERLEAVE)

2899

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2900

#define RHS_STEP_X PIXEL_UNIT

2901

#define RHS_STEP_LOOP (H0)

2902

#endif // defined(RHS_INTERLEAVE)

2903

2904

#if defined(DUMMY_WORK_ITEMS)

2905

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2910

2911

// Compute LHS matrix address

2912

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2913

(get_global_id(2) * lhs_stride_z);

2914

2915

#if defined(MATRIX_B_DEPTH)

2916

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2917

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

2918

#else // defined(MATRIX_B_DEPTH)

2919

const uint z_rhs = get_global_id(2);

2920

#endif // defined(MATRIX_B_DEPTH)

2921

2922

// Compute RHS matrix coordinates

2923

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

2924

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

2925

2926

// Initialize the accumulators

2927

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

2928

2929

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2930

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2931

2932

for(int i = 0; i < K; i += K0)

2933

{

2934

// Load values from LHS matrix

2935

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

2936

2937

// Load values from RHS matrix stored in a cl_image

2938

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

2939

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

2940

2941

// Accumulate

2942

ARM_DOT_K0XN0(a0, b, c0);

2943

#if M0 > 1

2944

ARM_DOT_K0XN0(a1, b, c1);

2945

#endif // M0 > 1

2946

#if M0 > 2

2947

ARM_DOT_K0XN0(a2, b, c2);

2948

#endif // M0 > 2

2949

#if M0 > 3

2950

ARM_DOT_K0XN0(a3, b, c3);

2951

#endif // M0 > 3

2952

#if M0 > 4

2953

ARM_DOT_K0XN0(a4, b, c4);

2954

#endif // M0 > 4

2955

#if M0 > 5

2956

ARM_DOT_K0XN0(a5, b, c5);

2957

#endif // M0 > 5

2958

#if M0 > 6

2959

ARM_DOT_K0XN0(a6, b, c6);

2960

#endif // M0 > 6

2961

#if M0 > 7

2962

ARM_DOT_K0XN0(a7, b, c7);

2963

#endif // M0 > 7

2964

2965

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2966

2967

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

2968

}

2969

2970

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2971

2972

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

2973

2974

#if defined(REINTERPRET_OUTPUT_AS_3D)

2975

2976

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2977

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2978

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2979

// multiply dst_stride_z by DEPTH_GEMM3D

2980

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2981

2982

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2983

2984

// Add offset for batched GEMM

2985

dst_addr += get_global_id(2) * dst_stride_z;

2986

2987

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2988

2989

// Multiply by the weight of matrix-matrix product and store the result

2990

#if defined(ALPHA)

2991

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2992

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2997

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2998

2999

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3000

3001

#ifndef UNIT_BETA

3002

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

3003

#endif // UNIT_BIAS

3004

3005

// c = c + bias[broadcasted]

3006

#if defined(MIXED_PRECISION)

3007

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3008

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

3009

#else // defined(MIXED_PRECISION)

3010

ADD_BLOCK_BROADCAST(M0, c, bias0);

3011

#endif // defined(MIXED_PRECISION)

3012

3013

#else // defined(BROADCAST_BIAS)

3014

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

3015

2) * bias_stride_z;

3016

3017

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3018

3019

#ifndef UNIT_BETA

3020

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

#if defined(MIXED_PRECISION)

3025

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3026

ADD_BLOCK(M0, c, bias_hp);

3027

#else // defined(MIXED_PRECISION)

3028

ADD_BLOCK(M0, c, bias);

3029

#endif // defined(MIXED_PRECISION)

3030

3031

#endif // defined(BROADCAST_BIAS)

3032

#endif // defined(BETA)

3033

3034

#if defined(ACTIVATION_TYPE)

3035

#if defined(MIXED_PRECISION)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

3036

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3037

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

3038

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3039

#endif // defined(MIXED_PRECISION)

3040

#endif // defined(ACTIVATION_TYPE)

3041

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

3042

const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);

3043

const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

3044

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3045

// Store output block

3046

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

3047

CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame^]

3048

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3049

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame^]

3050

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3051

#endif // defined(MIXED_PRECISION)

3052

3053

#undef LHS_BLOCK_SIZE

3054

#undef LHS_OFFSET_X

3055

#undef LHS_STEP_X

3056

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3064

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3065

#if defined(LHS_TRANSPOSE)

3066

3067

// Shorthand used by the ARM_VVM_T_NT_* macros: expands to VEC_DATA_TYPE(TYPE, SIZE)
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

3068

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3069

// Multiply-accumulate primitive used by the transposed-LHS GEMM macros: c = c + a * b
// - With MIXED_PRECISION both operands go through CONVERT(..., VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)) before the multiplication;
//   otherwise N0 is unused and the operands are multiplied as they are.
// - When GPU_ARCH == GPU_ARCH_MIDGARD a separate multiply and add is emitted, otherwise fma() is used.
// Note: the expansion already ends with ';' and the fma() variants mention c twice, so c must be free of side effects.
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#else // defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#endif // defined(MIXED_PRECISION)

3086

3087

// Column-vector (transposed) by row-vector (not transposed) multiplication for M0 = 1, K0 = 1:
// a is converted to VTYPE(TYPE, N0), multiplied by b and accumulated into C##0 through ARM_VFMA.
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
    ({                                                 \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
    })

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3091

// Column-vector (transposed) by row-vector (not transposed) multiplication for M0 = 2, K0 = 1:
// component a.s<i> is converted to VTYPE(TYPE, N0), multiplied by b and accumulated into C##<i> (i = 0, 1).
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3096

// Column-vector (transposed) by row-vector (not transposed) multiplication for M0 = 3, K0 = 1:
// rows 0-1 are delegated to ARM_VVM_T_NT_2xN0x1, then a.s2 * b is accumulated into C##2.
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3101

// Column-vector (transposed) by row-vector (not transposed) multiplication for M0 = 4, K0 = 1:
// rows 0-2 are delegated to ARM_VVM_T_NT_3xN0x1, then a.s3 * b is accumulated into C##3.
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3106

// Column-vector (transposed) by row-vector (not transposed) multiplication for M0 = 8, K0 = 1:
// rows 0-3 are delegated to ARM_VVM_T_NT_4xN0x1, then a.s<i> * b is accumulated into C##<i> for i = 4..7.
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })

3114

3115

// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
// a is the column-vector (transposed)
// b is the row-vector (not transposed)
// C is the output matrix
// Lower case is a vector (a, b)
// Upper case is a matrix (C)
// M0 is token-pasted into the name of the specialised macro, so it selects one of ARM_VVM_T_NT_{1,2,3,4,8}xN0x1
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)

3122

3123

// Matrix (transposed) by matrix (not transposed) multiplication for K0 = 1:
// a single vector-by-vector update using A##0 and B##0.
#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
    })

3127

// Matrix (transposed) by matrix (not transposed) multiplication for K0 = 2:
// the K0 = 1 update followed by the vector-by-vector update on A##1 and B##1.
#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
    })

3132

// Matrix (transposed) by matrix (not transposed) multiplication for K0 = 3:
// the K0 = 2 update followed by the vector-by-vector update on A##2 and B##2.
#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
    })

3137

// Matrix (transposed) by matrix (not transposed) multiplication for K0 = 4:
// the K0 = 3 update followed by the vector-by-vector update on A##3 and B##3.
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
    })

3142

// Matrix (transposed) by matrix (not transposed) multiplication for K0 = 8:
// the K0 = 4 update followed by the vector-by-vector updates on A##<i> and B##<i> for i = 4..7.
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
    })

3150

#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \

3151

({ \

3152

ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \

3153

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \

3154

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \

3155

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \

3156

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \

3157

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \

3158

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \

3159

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \

3160

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \

3161

})

3162

3163

// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
// The dimensions for this matrix multiplications are defined through M0, N0 and K0
// The dimensions supported are:
// M0: 1, 2, 3, 4, 8
// N0: 1, 2, 3, 4, 8, 16
// K0: 1, 2, 3, 4, 8, 16
// This macro calls the vector-by-matrix macro K0 times
// A, B and C are matrices
// K0 is concatenated to "ARM_MM_T_NT_M0xN0x" to pick the specialised macro, which then receives the remaining arguments
#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
    (M0, N0, TYPE, A, B, C)

3174

3175

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
 *
 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
                                            IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                            IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                            IMAGE_DECLARATION(dst),
                                            uint k,
                                            uint lhs_stride_z,
                                            uint rhs_stride_z,
#if defined(BETA)
                                            uint bias_stride_z,
#endif // defined(BETA)
                                            uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                            ,
                                            uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                           )
{
    // Block size
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (M0)
#define LHS_STEP_X ((M0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (M0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (N0)
#define RHS_STEP_X ((N0) * (H0))
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (N0)
#endif // defined(RHS_INTERLEAVE)

    // One step along K: load M0 LHS values and N0 RHS values, accumulate them into c through ARM_MM_T_NT with K0 = 1
    // and advance both pointers by LHS_STEP_X / RHS_STEP_X elements. a0 and b0 must be declared by the caller.
#define GEMM_MM_T_NT_K_STEP()                       \
    ({                                              \
        a0 = VLOAD(M0)(0, lhs);                     \
        b0 = VLOAD(N0)(0, rhs);                     \
        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); \
        lhs += LHS_STEP_X;                          \
        rhs += RHS_STEP_X;                          \
    })

    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    const uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items dispatched beyond the output matrix have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

    // Compute RHS matrix address
    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_addr += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);

    for(int i = 0; i < k; i += K0)
    {
        VEC_DATA_TYPE(DATA_TYPE, M0)
        a0;
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        // K0 steps in total: 1 unconditional, then 1 + 1 + 1 + 4 + 8 guarded by the value of K0
        GEMM_MM_T_NT_K_STEP();

#if K0 > 1
        GEMM_MM_T_NT_K_STEP();
#endif // K0 > 1

#if K0 > 2
        GEMM_MM_T_NT_K_STEP();
#endif // K0 > 2

#if K0 > 3
        GEMM_MM_T_NT_K_STEP();
#endif // K0 > 3

#if K0 > 4
        GEMM_MM_T_NT_K_STEP();
        GEMM_MM_T_NT_K_STEP();
        GEMM_MM_T_NT_K_STEP();
        GEMM_MM_T_NT_K_STEP();
#endif // K0 > 4

#if K0 > 8
        GEMM_MM_T_NT_K_STEP();
        GEMM_MM_T_NT_K_STEP();
        GEMM_MM_T_NT_K_STEP();
        GEMM_MM_T_NT_K_STEP();
        GEMM_MM_T_NT_K_STEP();
        GEMM_MM_T_NT_K_STEP();
        GEMM_MM_T_NT_K_STEP();
        GEMM_MM_T_NT_K_STEP();
#endif // K0 > 8

        // Without interleaving, skip the other (V0 - 1) / (H0 - 1) blocks stored on the same row
#ifndef LHS_INTERLEAVE
        lhs += (M0 * K0 * (V0 - 1));
#endif // LHS_INTERLEAVE

#ifndef RHS_INTERLEAVE
        rhs += (N0 * K0 * (H0 - 1));
#endif // RHS_INTERLEAVE
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // True when this work-item processes the last block along y / x, i.e. the one that may need a partial store
    const bool cond_y = ((y + 1) * M0 >= M);
    const bool cond_x = ((x + 1) * N0 >= N);

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif // defined(MIXED_PRECISION)

#undef GEMM_MM_T_NT_K_STEP
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef LHS_STEP_LOOP
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3555

#if defined(OPENCL_IMAGE_SUPPORT)

3556

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

3557

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must be transposed

3558

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must NOT be transposed

3559

*

3560

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

3561

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3562

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

3563

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

3564

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

3565

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

3566

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3567

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

3568

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

3569

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

3570

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

3571

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

3572

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

3573

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3574

* @note Only the following configurations of M0, N0 and K0 are currently supported:

* - M0 = 2, 3, 4, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

3582

* The activation function is performed after the bias addition

3583

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

3584

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3585

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3586

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3587

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

3588

*

3589

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

3590

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

3591

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

3592

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

3593

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

3594

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

3595

* @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr

3596

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

3597

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3598

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

3599

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3600

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

3601

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

3602

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

3603

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3604

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3605

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3606

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3607

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3608

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3609

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

3610

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

3611

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

3612

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3613

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3614

*/

3615

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),

3616

__read_only image2d_t rhs_img,

3617

#if defined(BETA)

3618

IMAGE_DECLARATION(bias),

3619

#endif // defined(BETA)

3620

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3621

uint k,

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

3627

uint dst_stride_z

3628

#if defined(REINTERPRET_OUTPUT_AS_3D)

3629

,

3630

uint dst_cross_plane_pad

3631

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

3636

3637

// Block size

3638

#define LHS_BLOCK_SIZE ((K0) * (M0))

3639

3640

#if defined(LHS_INTERLEAVE)

3641

#define LHS_OFFSET_X (M0)

3642

#define LHS_STEP_X ((M0) * (V0))

3643

#define LHS_STEP_LOOP (1)

3644

#else // defined(INTERLEAVE)

3645

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

3646

#define LHS_STEP_X (M0)

3647

#define LHS_STEP_LOOP (V0)

3648

#endif // defined(INTERLEAVE)

3649

3650

// Block size

3651

#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

3652

3653

// RHS offset and step X

3654

#if defined(RHS_INTERLEAVE)

3655

#define RHS_OFFSET_X (PIXEL_UNIT)

3656

#define RHS_STEP_X ((PIXEL_UNIT) * (H0))

3657

#else // defined(RHS_INTERLEAVE)

3658

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

3659

#define RHS_STEP_X (PIXEL_UNIT)

3660

#endif // defined(RHS_INTERLEAVE)

3661

3662

const uint x = get_global_id(0);

3663

const uint y = get_global_id(1);

3664

const uint z = get_global_id(2);

3665

3666

#if defined(DUMMY_WORK_ITEMS)

3667

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

3672

3673

// Compute LHS matrix address

3674

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

3675

3676

#if defined(MATRIX_B_DEPTH)

3677

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3678

const uint z_rhs = (z % MATRIX_B_DEPTH);

3679

#else // defined(MATRIX_B_DEPTH)

3680

const uint z_rhs = z;

3681

#endif // defined(MATRIX_B_DEPTH)

3682

3683

// Compute RHS matrix coordinates

3684

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

3685

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

3686

3687

// Initialize the accumulators

3688

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

3689

3690

REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

3691

3692

__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

3693

3694

for(int i = 0; i < K; i += K0)

3695

{

3696

VEC_DATA_TYPE(DATA_TYPE, M0)

3697

a0;

3698

VEC_DATA_TYPE(DATA_TYPE, N0)

3699

b0;

3700

3701

a0 = VLOAD(M0)(0, lhs);

3702

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

3703

3704

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#if K0 > 1

a0 = VLOAD(M0)(0, lhs);

3710

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

3711

3712

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 1

#if K0 > 2

a0 = VLOAD(M0)(0, lhs);

3719

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

3720

3721

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 2

#if K0 > 3

a0 = VLOAD(M0)(0, lhs);

3728

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

3729

3730

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 3

#if K0 > 4

a0 = VLOAD(M0)(0, lhs);

3737

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

3738

3739

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3744

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

3745

3746

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3751

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

3752

3753

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3758

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

3759

3760

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 4

#if K0 > 8

a0 = VLOAD(M0)(0, lhs);

3767

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

3768

3769

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3774

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

3775

3776

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3781

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

3782

3783

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3788

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

3789

3790

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3795

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

3796

3797

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3802

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

3803

3804

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3809

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

3810

3811

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3816

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

3817

3818

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 8

#ifndef LHS_INTERLEAVE

3824

lhs += (M0 * K0 * (V0 - 1));

3825

#endif // LHS_INTERLEAVE

3826

3827

x_rhs += K0 * RHS_STEP_X;

3828

#ifndef RHS_INTERLEAVE

3829

x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));

3830

#endif // RHS_INTERLEAVE

3831

}

3832

3833

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

3834

3835

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

3836

3837

#if defined(REINTERPRET_OUTPUT_AS_3D)

3838

3839

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

3840

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

3841

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3842

// multiply dst_stride_z by DEPTH_GEMM3D

3843

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

3844

3845

#else // defined(REINTERPRET_OUTPUT_AS_3D)

3846

3847

// Add offset for batched GEMM

3848

dst_addr += z * dst_stride_z;

3849

3850

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3851

3852

// Multiply by the weight of matrix-matrix product and store the result

3853

#if defined(ALPHA)

3854

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

3855

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

3860

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

3861

3862

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3863

3864

#ifndef UNIT_BETA

3865

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

3866

#endif // UNIT_BIAS

3867

3868

// c = c + bias[broadcasted]

3869

#if defined(MIXED_PRECISION)

3870

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3871

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

3872

#else // defined(MIXED_PRECISION)

3873

ADD_BLOCK_BROADCAST(M0, c, bias0);

3874

#endif // defined(MIXED_PRECISION)

3875

3876

#else // defined(BROADCAST_BIAS)

3877

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

3878

3879

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3880

3881

#ifndef UNIT_BETA

3882

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

3883

#endif // UNIT_BIAS

3884

3885

#if defined(MIXED_PRECISION)

3886

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3887

ADD_BLOCK(M0, c, bias_hp);

3888

#else // defined(MIXED_PRECISION)

3889

ADD_BLOCK(M0, c, bias);

3890

#endif // defined(MIXED_PRECISION)

3891

3892

#endif // defined(BROADCAST_BIAS)

3893

#endif // defined(BETA)

3894

3895

#if defined(ACTIVATION_TYPE)

3896

#if defined(MIXED_PRECISION)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

3897

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3898

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

3899

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3900

#endif // defined(MIXED_PRECISION)

3901

#endif // defined(ACTIVATION_TYPE)

3902

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

3903

const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);

3904

const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

3905

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3906

// Store output block

3907

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

3908

CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame^]

3909

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3910

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame^]

3911

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3912

#endif // defined(MIXED_PRECISION)

3913

3914

#undef LHS_BLOCK_SIZE

3915

#undef LHS_OFFSET_X

3916

#undef LHS_STEP_X

3917

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3925

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3926

#endif // defined(LHS_TRANSPOSE)

3927

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

3928

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)

3929

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3930

#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

3931

3932

/** Fused multiply-add accumulation: assigns fma(a, b, c) to c.
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator: passed to fma() as the addend and overwritten with the result (must be an lvalue)
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/** Accumulate one K-step into the M0 accumulator rows.
 *
 * For every row r in [0, M0): element .s<i> of the LHS row vector a<r> is converted to
 * VEC_DATA_TYPE(DATA_TYPE, N0) and combined through VFMA with @p b into the accumulator c<r>.
 * One variant is provided for each supported M0 (1 to 8); any other value stops the build.
 *
 * @param[in]      i Component suffix (single hexadecimal digit) pasted after ".s" to select the LHS element
 * @param[in]      a Base name of the LHS row vectors (a0, a1, ... are formed by token pasting)
 * @param[in]      b RHS vector multiplied with the selected LHS element
 * @param[in, out] c Base name of the accumulator vectors (c0, c1, ... are formed by token pasting)
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

4008

4009

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

4010

* The LHS matrix is NOT reshaped

4011

* The RHS matrix is NOT reshaped

4012

*

4013

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4014

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

4015

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

4016

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

4017

* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)

4018

* @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

4019

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

4020

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4021

* @note Only the following configurations of M0, N0 and K0 are currently supported:

4022

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

4023

* - N0 = 2, 3, 4, 8, 16

4024

* - K0 = 2, 3, 4, 8, 16

4025

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4026

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

4027

* The activation function is performed after the bias addition

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4028

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

4029

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

4030

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4031

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4032

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4033

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

4034

*

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4035

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

4036

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

4037

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

4038

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

4039

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

4040

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

4041

* @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr

4042

* @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)

4043

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

4044

* @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)

4045

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

4046

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4047

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

4048

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4049

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

4050

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4051

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

4052

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

4053

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

4054

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4055

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4056

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4057

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4058

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

4059

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

4060

* @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)

4061

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

4062

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

4063

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

4064

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4065

*/

4066

__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),

4067

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4068

#if defined(BETA)

4069

IMAGE_DECLARATION(bias),

4070

#endif // defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4071

IMAGE_DECLARATION(dst),

4072

uint lhs_stride_z,

4073

uint rhs_stride_z,

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4074

#if defined(BETA)

4075

uint bias_stride_z,

4076

#endif //defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4077

uint dst_stride_z

4078

#if defined(REINTERPRET_INPUT_AS_3D)

4079

,

4080

uint lhs_cross_plane_pad

4081

#endif // REINTERPRET_INPUT_AS_3D

4082

#if defined(REINTERPRET_OUTPUT_AS_3D)

4083

,

4084

uint dst_cross_plane_pad

4085

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

4090

4091

// RHS offset and step X

4092

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

4093

4094

uint x = get_global_id(0);

4095

uint y = get_global_id(1);

4096

uint z = get_global_id(2);

4097

4098

#if defined(DUMMY_WORK_ITEMS)

4099

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

4104

4105

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

4106

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4107

4108

// Compute RHS matrix address

4109

uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

4110

4111

#if defined(MATRIX_B_DEPTH)

4112

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4113

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

4114

#else // defined(MATRIX_B_DEPTH)

4115

rhs_offset += z * rhs_stride_z;

4116

#endif // defined(MATRIX_B_DEPTH)

4117

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4118

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

4119

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4120

4121

#if defined(REINTERPRET_INPUT_AS_3D)

4122

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

4123

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

4124

4125

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4126

// multiply lhs_stride_z by DEPTH_GEMM3D

4127

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

4128

4129

#else // defined(REINTERPRET_INPUT_AS_3D)

4130

4131

// Add offset for batched GEMM

4132

lhs_offset += z * lhs_stride_z;

4133

4134

#endif // defined(REINTERPRET_INPUT_AS_3D)

4135

4136

// Initialize the accumulators

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4137

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4138

4139

int i = 0;

4140

for(; i <= (K - K0); i += K0)

4141

{

4142

// Supported cases (M0, K0):

4143

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

4144

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

4145

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

4146

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

4147

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

4148

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

4149

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

4150

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

4151

// Load values from LHS matrix

4152

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

4153

4154

// Load values from RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4155

LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4156

4157

RHS_VFMA_M0xN0(0, a, b0, c);

4158

RHS_VFMA_M0xN0(1, a, b1, c);

4159

#if K0 > 2

4160

RHS_VFMA_M0xN0(2, a, b2, c);

4161

#endif // K0 > 2

4162

#if K0 > 3

4163

RHS_VFMA_M0xN0(3, a, b3, c);

4164

#endif // K0 > 3

4165

#if K0 > 4

4166

RHS_VFMA_M0xN0(4, a, b4, c);

4167

RHS_VFMA_M0xN0(5, a, b5, c);

4168

RHS_VFMA_M0xN0(6, a, b6, c);

4169

RHS_VFMA_M0xN0(7, a, b7, c);

4170

#endif // K0 > 4

4171

#if K0 > 8

4172

RHS_VFMA_M0xN0(8, a, b8, c);

4173

RHS_VFMA_M0xN0(9, a, b9, c);

Gian Marco Iodice

7b9d7ca

2019-09-19 16:37:39 +0100

[diff] [blame]

4174

RHS_VFMA_M0xN0(A, a, bA, c);

4175

RHS_VFMA_M0xN0(B, a, bB, c);

4176

RHS_VFMA_M0xN0(C, a, bC, c);

4177

RHS_VFMA_M0xN0(D, a, bD, c);

4178

RHS_VFMA_M0xN0(E, a, bE, c);

4179

RHS_VFMA_M0xN0(F, a, bF, c);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4180

#endif // K0 > 8

4181

4182

lhs_offset += K0 * sizeof(DATA_TYPE);

4183

rhs_offset += K0 * rhs_stride_y;

4184

}

4185

4186

// Left-over accumulations

4187

for(; i < K; ++i)

4188

{

4189

// Load values from LHS matrix

4190

VEC_DATA_TYPE(DATA_TYPE, 2)

4191

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));

4192

#if M0 > 1

4193

VEC_DATA_TYPE(DATA_TYPE, 2)

4194

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));

4195

#endif // M0 > 1

4196

#if M0 > 2

4197

VEC_DATA_TYPE(DATA_TYPE, 2)

4198

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));

4199

#endif // M0 > 2

4200

#if M0 > 3

4201

VEC_DATA_TYPE(DATA_TYPE, 2)

4202

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));

4203

#endif // M0 > 3

4204

#if M0 > 4

4205

VEC_DATA_TYPE(DATA_TYPE, 2)

4206

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));

4207

#endif // M0 > 4

4208

#if M0 > 5

4209

VEC_DATA_TYPE(DATA_TYPE, 2)

4210

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));

4211

#endif // M0 > 5

4212

#if M0 > 6

4213

VEC_DATA_TYPE(DATA_TYPE, 2)

4214

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));

4215

#endif // M0 > 6

4216

#if M0 > 7

4217

VEC_DATA_TYPE(DATA_TYPE, 2)

4218

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));

4219

#endif // M0 > 7

4220

4221

VEC_DATA_TYPE(DATA_TYPE, N0)

4222

b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));

4223

RHS_VFMA_M0xN0(0, a, b, c);

4224

4225

lhs_offset += sizeof(DATA_TYPE);

4226

rhs_offset += rhs_stride_y;

4227

}

4228

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

4229

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4230

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4231

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4232

4233

#if defined(REINTERPRET_OUTPUT_AS_3D)

4234

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

4235

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

4236

4237

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4238

// multiply dst_stride_z by DEPTH_GEMM3D

4239

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

4240

4241

#else // defined(REINTERPRET_OUTPUT_AS_3D)

4242

4243

// Add offset for batched GEMM

4244

dst_addr += z * dst_stride_z;

4245

4246

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4247

4248

// Multiply by the weight of matrix-matrix product and store the result

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4249

#if defined(ALPHA)

4250

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

4251

#endif // defined(ALPHA)

4252

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4253

// Add beta*bias

4254

#if defined(BETA)

4255

#if defined(BROADCAST_BIAS)

4256

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

4257

4258

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

4259

4260

#ifndef UNIT_BETA

4261

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

4262

#endif // UNIT_BETA

4263

4264

// c = c + bias[broadcasted]

4265

ADD_BLOCK_BROADCAST(M0, c, bias0);

4266

4267

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

4268

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4269

4270

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

4271

4272

#ifndef UNIT_BETA

4273

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

4278

4279

#endif // defined(BROADCAST_BIAS)

4280

#endif // defined(BETA)

4281

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

4282

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

4283

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

4284

#endif // defined(ACTIVATION_TYPE)

4285

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

4286

const bool cond_y = y == 0;

4287

const bool cond_x = ((x + 1) * N0 >= N);

4288

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4289

// Store output block

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame^]

4290

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4291

4292

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

4297

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4298

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4299

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4300

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4301

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4302

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4303

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4304

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4305

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4306

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4307

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

4308

* The activation function is performed after the bias addition

4309

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4310

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4311

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4312

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4313

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4314

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4315

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

4316

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4317

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

4318

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4319

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

4320

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4321

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4322

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4323

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

4324

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4325

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

4326

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4327

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4328

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4329

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4330

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4331

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4332

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4333

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4334

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4335

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4336

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4337

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4338

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4339

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4340

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4341

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4342

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4343

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4344

*/

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4345

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                 IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
#if defined(BETA)
                                                 uint src2_stride_z,
#endif //defined(BETA)
                                                 uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Row of reshaped matrix B (tile_x) and of reshaped matrix A (tile_y) read by this work-item, plus the batch index
    const int tile_x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
    const int tile_y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
    const int batch  = get_global_id(2);

    // Element offset, inside the reshaped row, of the 4 values owned by this work-item
    const int a_lane_offset = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int b_lane_offset = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

    // Byte offset of the first element to read from matrix A and from matrix B
    int a_byte_offset = batch * src0_stride_z + tile_y * src0_stride_y + src0_offset_first_element_in_bytes;
    int b_byte_offset = tile_x * src1_stride_y + src1_offset_first_element_in_bytes;

#if defined(MATRIX_B_DEPTH)
    // Matrix B has fewer batches than matrix A: wrap the batch index instead of sliding past the end of B
    b_byte_offset += (batch % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    b_byte_offset += batch * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // The end-of-row marker for matrix B is taken before the per-work-item lane offset is applied
    __global float *b_row_start = (__global float *)(src1_ptr + b_byte_offset);
    __global float *b_row_end   = b_row_start + COLS_B;

    __global float *a_ptr = (__global float *)(src0_ptr + a_byte_offset) + a_lane_offset;
    __global float *b_ptr = b_row_start + b_lane_offset;

    // 4x4 output tile accumulators (one float4 per output row); the macros below address them as c0..c3
    float4 c0 = 0.0f;
    float4 c1 = 0.0f;
    float4 c2 = 0.0f;
    float4 c3 = 0.0f;

    // Main loop: two accumulation steps per iteration
    while(b_ptr <= (b_row_end - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)))
    {
        // First step: values from matrix A (interleaved) and matrix B (transposed)
        const float4 a_first = vload4(0, a_ptr);
        const float4 b_first = vload4(0, b_ptr);

        c0 += (float4)a_first.s0 * b_first;
        c1 += (float4)a_first.s1 * b_first;
        c2 += (float4)a_first.s2 * b_first;
        c3 += (float4)a_first.s3 * b_first;

        // Second step: the next group of 4 values of each matrix
        const float4 a_second = vload4(0, a_ptr + 4 * MULT_INTERLEAVE4X4_HEIGHT);
        const float4 b_second = vload4(0, b_ptr + 4 * MULT_TRANSPOSE1XW_WIDTH);

        c0 += (float4)a_second.s0 * b_second;
        c1 += (float4)a_second.s1 * b_second;
        c2 += (float4)a_second.s2 * b_second;
        c3 += (float4)a_second.s3 * b_second;

        a_ptr += 8 * MULT_INTERLEAVE4X4_HEIGHT;
        b_ptr += 8 * MULT_TRANSPOSE1XW_WIDTH;
    }

    // Left-over loop: one accumulation step per iteration until the end of the row of matrix B
    while(b_ptr < b_row_end)
    {
        const float4 a_vals = vload4(0, a_ptr);
        const float4 b_vals = vload4(0, b_ptr);

        c0 += (float4)a_vals.s0 * b_vals;
        c1 += (float4)a_vals.s1 * b_vals;
        c2 += (float4)a_vals.s2 * b_vals;
        c3 += (float4)a_vals.s3 * b_vals;

        a_ptr += 4 * MULT_INTERLEAVE4X4_HEIGHT;
        b_ptr += 4 * MULT_TRANSPOSE1XW_WIDTH;
    }

    // Destination address of the output tile for this work-item
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    __global uchar *dst_addr = offset(&dst, 0, 0);

    // Per-row extra byte offset; stays zero unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The 2D output tile is stored in a 3D tensor: each of the 4 rows may fall in a different plane,
    // and every plane already crossed adds cross_plane_pad padding rows to the row's address.

    // Plane index of each row: row index (get_global_id(1) * 4 + {0,1,2,3}) divided by HEIGHT_GEMM3D
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Convert the plane index into the byte offset introduced by the cross plane paddings
    zout *= (cross_plane_pad * dst_stride_y);

    // Batched GEMM: the batches live in the fourth dimension, hence dst_stride_z is multiplied by DEPTH_GEMM3D
    dst_addr += batch * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Batched GEMM: move to the output slice of this batch
    dst_addr += batch * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Scale the matrix-matrix product by alpha
#if defined(ALPHA)
    SCALE_BLOCK(4, float, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta * bias
#if defined(BETA)
    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to all 4 rows of the tile
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

    LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, float, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(4, c, bias0);

#else // defined(BROADCAST_BIAS)
    // A full 4x4 bias tile is loaded from the position matching the output tile
    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(4, float, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(4, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // The activation is applied after the bias addition
#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store the 4x4 block, one row per vstore4
    vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
    vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
    vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
    vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
}

4517

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4518

/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4519

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4520

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4521

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4522

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4523

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4524

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4525

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4526

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4527

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

4528

* The activation function is performed after the bias addition

4529

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4530

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4531

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4532

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4533

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4534

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4535

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

4536

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4537

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

4538

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4539

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

4540

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4541

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4542

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4543

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

4544

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4545

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

4546

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4547

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4548

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4549

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4550

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4551

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4552

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4553

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4554

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4555

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4556

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4557

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4558

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4559

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4560

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4561

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4562

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4563

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4564

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4565

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

4566

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4567

#if defined(BETA)

4568

IMAGE_DECLARATION(src2),

4569

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4570

IMAGE_DECLARATION(dst),

4571

uint src0_stride_z,

4572

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4573

#if defined(BETA)

4574

uint src2_stride_z,

4575

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4576

uint dst_stride_z

4577

#if defined(REINTERPRET_OUTPUT_AS_3D)

4578

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4579

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4580

#endif // REINTERPRET_OUTPUT_AS_3D

4581

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4582

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4583

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4584

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4585

int z = get_global_id(2);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4586

4587

// Offset

4588

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4589

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

4590

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4591

// src_addr_a = address of matrix A

4592

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4593

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4594

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4595

4596

#if defined(MATRIX_B_DEPTH)

4597

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4598

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4599

#else // defined(MATRIX_B_DEPTH)

4600

src1_addr_in_bytes += z * src1_stride_z;

4601

#endif // defined(MATRIX_B_DEPTH)

4602

4603

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

4604

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4605

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4606

src_addr_a += offset_row_a;

4607

src_addr_b += offset_row_b;

4608

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4609

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4614

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4615

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

4616

4617

int i = 0;

4618

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4619

{

4620

// Load values from matrix A (interleaved) and matrix B (transposed)

4621

float4 a0 = vload4(0, src_addr_a);

4622

float4 b0 = vload4(0, src_addr_b);

4623

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4624

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4625

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4626

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4627

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4628

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4629

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4630

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4631

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4632

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4633

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4634

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4635

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4636

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4637

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4638

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4639

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4640

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4641

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4642

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4643

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4644

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4645

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4646

4647

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4648

a0 = vload4(0, src_addr_a);

4649

b0 = vload4(0, src_addr_b);

4650

4651

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4652

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4653

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4654

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4655

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4656

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4657

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4658

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4659

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4660

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4661

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4662

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4663

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4664

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4665

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4666

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4667

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4668

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4669

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4670

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4671

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4672

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4673

4674

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4675

a0 = vload4(0, src_addr_a);

4676

b0 = vload4(0, src_addr_b);

4677

4678

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4679

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

4680

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4681

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4682

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4683

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4684

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4685

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4686

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4687

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4688

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4689

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4690

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4691

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4692

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4693

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4694

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4695

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4696

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4697

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4698

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4699

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4700

4701

// Load values from matrix A (interleaved) and matrix B (transposed)

4702

a0 = vload4(0, src_addr_a);

4703

b0 = vload4(0, src_addr_b);

4704

4705

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4706

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4707

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4708

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4709

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4710

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4711

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4712

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4713

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4714

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4715

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4716

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4717

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4718

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4719

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4720

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4721

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4722

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4723

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4724

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4725

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4726

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4727

}

4728

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4729

for(; i < (int)(COLS_MTX_B); ++i)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4730

{

4731

// Load values from matrix A (interleaved) and matrix B (transposed)

4732

float4 a0 = vload4(0, src_addr_a);

4733

float4 b0 = vload4(0, src_addr_b);

4734

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4735

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4736

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

4737

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4738

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4739

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4740

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4741

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4742

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4743

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4744

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4745

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4746

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4747

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4748

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4749

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4750

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4751

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4752

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4753

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4754

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4755

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4756

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4757

}

4758

4759

// Compute destination address

4760

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4761

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4762

// Compute dst address

4763

__global uchar *dst_addr = offset(&dst, 0, 0);

4764

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4765

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4766

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4767

#if defined(REINTERPRET_OUTPUT_AS_3D)

4768

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4769

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4770

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4775

// |******************|

4776

// | cross_plane_pad |

4777

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4782

4783

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4784

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4785

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4786

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4787

// Add offset due to the cross plane paddings

4788

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4789

4790

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4791

// multiply dst_stride_z by DEPTH_GEMM3D

4792

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4793

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4794

// Add offset for batched GEMM

4795

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4796

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4797

4798

// Multiply by the weight of matrix-matrix product and store the result

4799

#if defined(ALPHA)

4800

SCALE_BLOCK(4, float, c, ALPHA);

4801

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4806

4807

#if defined(BROADCAST_BIAS)

4808

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4809

4810

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4811

4812

#ifndef UNIT_BETA

4813

SCALE_BLOCK(1, float, bias, BETA);

4814

#endif // UNIT_BETA

4815

4816

// c = c + bias[broadcasted]

4817

ADD_BLOCK_BROADCAST(4, c, bias0);

4818

4819

#else // defined(BROADCAST_BIAS)

4820

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

4821

2) * src2_stride_z;

4822

4823

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4824

4825

#ifndef UNIT_BETA

4826

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

4831

4832

#endif // defined(BROADCAST_BIAS)

4833

#endif // defined(BETA)

4834

4835

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

4836

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4837

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4838

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4839

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4840

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

4841

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

4842

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

4843

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4844

}

4845

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

4846

// Undefine local defines

4847

#undef COLS_MTX_B

4848

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

4849

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4850

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4851

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4852

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4853

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4854

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4855

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4856

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4857

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4858

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

4859

* The activation function is performed after the bias addition

4860

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4861

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4862

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4863

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4864

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4865

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4866

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

4867

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4868

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

4869

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4870

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

4871

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4872

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4873

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4874

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

4875

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4876

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

4877

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4878

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

4879

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4880

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4881

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4882

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4883

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4884

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4885

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4886

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4887

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4888

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4889

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4890

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4891

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4892

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4893

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4894

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4895

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4896

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

4897

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4898

#if defined(BETA)

4899

IMAGE_DECLARATION(src2),

4900

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4901

IMAGE_DECLARATION(dst),

4902

uint src0_stride_z,

4903

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4904

#if defined(BETA)

4905

uint src2_stride_z,

4906

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4907

uint dst_stride_z

4908

#if defined(REINTERPRET_OUTPUT_AS_3D)

4909

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4910

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4911

#endif // REINTERPRET_OUTPUT_AS_3D

4912

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4913

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4914

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4915

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4916

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4917

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4918

// Offset

4919

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4920

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4921

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4922

// src_addr_a = address of matrix A

4923

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4924

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4925

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4926

4927

#if defined(MATRIX_B_DEPTH)

4928

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4929

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4930

#else // defined(MATRIX_B_DEPTH)

4931

src1_addr_in_bytes += z * src1_stride_z;

4932

#endif // defined(MATRIX_B_DEPTH)

4933

4934

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

4935

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4936

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4937

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4938

__global half *src_end_addr_b = src_addr_b + COLS_B;

4939

4940

src_addr_a += offset_row_a;

4941

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4942

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4943

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4948

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4949

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4950

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4951

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4952

half4 a0 = vload4(0, src_addr_a);

4953

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4954

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4955

c0 += (half8)a0.s0 * b0;

4956

c1 += (half8)a0.s1 * b0;

4957

c2 += (half8)a0.s2 * b0;

4958

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4959

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4960

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4961

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

4962

b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4963

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4964

c0 += (half8)a0.s0 * b0;

4965

c1 += (half8)a0.s1 * b0;

4966

c2 += (half8)a0.s2 * b0;

4967

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4968

}

4969

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4970

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4971

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4972

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4973

half4 a0 = vload4(0, src_addr_a);

4974

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4975

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4976

c0 += (half8)a0.s0 * b0;

4977

c1 += (half8)a0.s1 * b0;

4978

c2 += (half8)a0.s2 * b0;

4979

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4980

}

4981

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4982

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4983

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4984

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4985

// Compute dst address

4986

__global uchar *dst_addr = offset(&dst, 0, 0);

4987

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4988

uint4 zout = 0;

4989

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4990

#if defined(REINTERPRET_OUTPUT_AS_3D)

4991

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4992

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4993

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4998

// |******************|

4999

// | cross_plane_pad |

5000

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5005

5006

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5007

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

5008

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5009

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5010

// Add offset due to the cross plane paddings

5011

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5012

5013

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5014

// multiply dst_stride_z by DEPTH_GEMM3D

5015

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5016

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5017

// Add offset for batched GEMM

5018

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5019

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5020

5021

// Multiply by the weight of matrix-matrix product and store the result

5022

#if defined(ALPHA)

5023

SCALE_BLOCK(4, half, c, ALPHA);

5024

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5029

5030

#if defined(BROADCAST_BIAS)

5031

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5032

5033

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5034

5035

#ifndef UNIT_BETA

5036

SCALE_BLOCK(1, half, bias, BETA);

5037

#endif // UNIT_BIAS

5038

5039

// c = c + bias[broadcasted]

5040

ADD_BLOCK_BROADCAST(4, c, bias0);

5041

5042

#else // defined(BROADCAST_BIAS)

5043

5044

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5045

2) * src2_stride_z;

5046

5047

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5048

5049

#ifndef UNIT_BETA

5050

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias);

5055

5056

#endif // defined(BROADCAST_BIAS)

5057

#endif // defined(BETA)

5058

5059

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

5060

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5061

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5062

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5063

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5064

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5065

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5066

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5067

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5068

}

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5069

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5070

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable.

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5071

*

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5072

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5073

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

5074

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

5075

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5076

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5077

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5078

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

5079

* The activation function is performed after the bias addition

5080

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5081

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5082

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5083

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5084

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5085

*

5086

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5087

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5088

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5089

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5090

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5091

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5092

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5093

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5094

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5095

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5096

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5097

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5098

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5099

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5100

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5101

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5102

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5103

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5104

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

5105

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5106

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5107

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5108

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5109

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

5110

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5111

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5112

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5113

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

5114

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

5115

*/

5116

__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),

5117

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5118

#if defined(BETA)

5119

IMAGE_DECLARATION(src2),

5120

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5121

IMAGE_DECLARATION(dst),

5122

uint src0_stride_z,

5123

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5124

#if defined(BETA)

5125

uint src2_stride_z,

5126

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5127

uint dst_stride_z

5128

#if defined(REINTERPRET_OUTPUT_AS_3D)

5129

,

5130

uint cross_plane_pad

5131

#endif // REINTERPRET_OUTPUT_AS_3D

5132

)

5133

{

5134

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

5135

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

5136

int z = get_global_id(2);

5137

5138

// Offset

5139

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

5140

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

5141

5142

// src_addr_a = address of matrix A

5143

// src_addr_b = address of matrix B

5144

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

5145

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

5146

5147

#if defined(MATRIX_B_DEPTH)

5148

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5149

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

5150

#else // defined(MATRIX_B_DEPTH)

5151

src1_addr_in_bytes += z * src1_stride_z;

5152

#endif // defined(MATRIX_B_DEPTH)

5153

5154

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

5155

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

5156

5157

// Compute end row address for matrix B

5158

__global half *src_end_addr_b = src_addr_b + COLS_B;

5159

5160

src_addr_a += offset_row_a;

5161

src_addr_b += offset_row_b;

5162

5163

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float8 c0 = 0.0f;

float8 c1 = 0.0f;

float8 c2 = 0.0f;

float8 c3 = 0.0f;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5168

5169

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

5170

{

5171

// Load values from matrix A (interleaved) and matrix B (transposed)

5172

float4 a0 = convert_float4(vload4(0, src_addr_a));

5173

float8 b0 = convert_float8(vload8(0, src_addr_b));

5174

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5175

c0 += (float8)a0.s0 * b0;

5176

c1 += (float8)a0.s1 * b0;

5177

c2 += (float8)a0.s2 * b0;

5178

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5179

5180

// Load values from matrix A (interleaved) and matrix B (transposed)

5181

a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));

5182

b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));

5183

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5184

c0 += (float8)a0.s0 * b0;

5185

c1 += (float8)a0.s1 * b0;

5186

c2 += (float8)a0.s2 * b0;

5187

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5188

}

5189

5190

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

5191

{

5192

// Load values from matrix A (interleaved) and matrix B (transposed)

5193

float4 a0 = convert_float4(vload4(0, src_addr_a));

5194

float8 b0 = convert_float8(vload8(0, src_addr_b));

5195

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5196

c0 += (float8)a0.s0 * b0;

5197

c1 += (float8)a0.s1 * b0;

5198

c2 += (float8)a0.s2 * b0;

5199

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5200

}

5201

5202

// Compute destination address

5203

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5204

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5205

// Compute dst address

5206

__global uchar *dst_addr = offset(&dst, 0, 0);

5207

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5208

uint4 zout = 0;

5209

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5210

#if defined(REINTERPRET_OUTPUT_AS_3D)

5211

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

5212

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5218

// |******************|

5219

// | cross_plane_pad |

5220

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5225

5226

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5227

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

5228

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5229

5230

// Add offset due to the cross plane paddings

5231

zout *= (cross_plane_pad * dst_stride_y);

5232

5233

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5234

// multiply dst_stride_z by DEPTH_GEMM3D

5235

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5236

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5237

// Add offset for batched GEMM

5238

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5239

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5240

5241

// Multiply by the weight of matrix-matrix product and store the result

5242

#if defined(ALPHA)

5243

SCALE_BLOCK(4, float, c, ALPHA);

5244

#endif // defined(ALPHA)

5245

5246

#if defined(BETA)

5247

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5248

5249

#if defined(BROADCAST_BIAS)

5250

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5251

5252

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5253

5254

float8 bias_f0 = convert_float8(bias0);

5255

5256

#ifndef UNIT_BETA

5257

SCALE_BLOCK(1, float, bias_f, BETA);

5258

#endif // UNIT_BETA

5259

5260

// c = c + bias[broadcasted]

5261

ADD_BLOCK_BROADCAST(4, c, bias_f0);

5262

5263

#else // defined(BROADCAST_BIAS)

5264

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5265

2) * src2_stride_z;

5266

5267

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5268

5269

float8 bias_f0 = convert_float8(bias0);

5270

float8 bias_f1 = convert_float8(bias1);

5271

float8 bias_f2 = convert_float8(bias2);

5272

float8 bias_f3 = convert_float8(bias3);

5273

5274

#ifndef UNIT_BETA

5275

SCALE_BLOCK(4, float, bias_f, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias_f);

5280

5281

#endif // defined(BROADCAST_BIAS)

5282

#endif // defined(BETA)

5283

5284

half8 c_h0 = convert_half8(c0);

5285

half8 c_h1 = convert_half8(c1);

5286

half8 c_h2 = convert_half8(c2);

5287

half8 c_h3 = convert_half8(c3);

5288

5289

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

5290

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c_h, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5291

#endif // defined(ACTIVATION_TYPE)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5292

5293

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5294

vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5295

vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5296

vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5297

vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5298

}

5299

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5300

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5301

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5302

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5303

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

5304

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

5305

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5306

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5307

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5308

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5309

* The activation function is performed after the bias addition

5310

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5311

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5312

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5313

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5314

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped

5315

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5316

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5317

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5318

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

5319

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5320

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

5321

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5322

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5323

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5324

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

5325

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5326

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

5327

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5328

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5329

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5330

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5331

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5332

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5333

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5334

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

5335

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5336

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5337

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5338

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5339

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5340

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5341

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

5342

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5343

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5344

*/

5345

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),

5346

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5347

#if defined(BETA)

5348

IMAGE_DECLARATION(src2),

5349

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5350

IMAGE_DECLARATION(dst),

5351

uint src0_stride_z,

5352

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5353

#if defined(BETA)

5354

uint src2_stride_z,

5355

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5356

uint dst_stride_z

5357

#if defined(REINTERPRET_OUTPUT_AS_3D)

5358

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5359

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5360

#endif // REINTERPRET_OUTPUT_AS_3D

5361

)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5362

{

5363

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

5364

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

5365

int z = get_global_id(2);

5366

5367

// Offset

5368

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

5369

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

5370

5371

// src_addr_a = address of matrix A

5372

// src_addr_b = address of matrix B

5373

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

5374

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

5375

5376

#if defined(MATRIX_B_DEPTH)

5377

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5378

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

5379

#else // defined(MATRIX_B_DEPTH)

5380

src1_addr_in_bytes += z * src1_stride_z;

5381

#endif // defined(MATRIX_B_DEPTH)

5382

5383

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

5384

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

5385

5386

// Compute end row address for matrix B

5387

__global half *src_end_addr_b = src_addr_b + COLS_B;

5388

5389

src_addr_a += offset_row_a;

5390

src_addr_b += offset_row_b;

5391

5392

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5397

5398

#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

5399

5400

int i = 0;

5401

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

5402

{

5403

#if MULT_INTERLEAVE4X4_HEIGHT == 1

5404

// Load values from matrix A (interleaved) and matrix B (transposed)

5405

half8 a0 = vload8(0, src_addr_a);

5406

half8 b0 = vload8(0, src_addr_b);

5407

5408

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

5409

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5410

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5411

c0 = fma((half8)a0.s0, b0, c0);

5412

c1 = fma((half8)a0.s1, b0, c1);

5413

c2 = fma((half8)a0.s2, b0, c2);

5414

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5415

5416

// Load values from matrix B (transposed)

5417

b0 = vload8(0, src_addr_b);

5418

5419

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5420

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5421

c0 = fma((half8)a0.s4, b0, c0);

5422

c1 = fma((half8)a0.s5, b0, c1);

5423

c2 = fma((half8)a0.s6, b0, c2);

5424

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5425

5426

// Load values from matrix A (interleaved) and matrix B (transposed)

5427

a0 = vload8(0, src_addr_a);

5428

b0 = vload8(0, src_addr_b);

5429

5430

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

5431

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5432

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5433

c0 = fma((half8)a0.s0, b0, c0);

5434

c1 = fma((half8)a0.s1, b0, c1);

5435

c2 = fma((half8)a0.s2, b0, c2);

5436

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5437

5438

// Load values from matrix B (transposed)

5439

b0 = vload8(0, src_addr_b);

5440

5441

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5442

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5443

c0 = fma((half8)a0.s4, b0, c0);

5444

c1 = fma((half8)a0.s5, b0, c1);

5445

c2 = fma((half8)a0.s6, b0, c2);

5446

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5447

#else // MULT_INTERLEAVE4X4_HEIGHT == 1

5448

// Load values from matrix A (interleaved) and matrix B (transposed)

5449

half4 a0 = vload4(0, src_addr_a);

5450

half8 b0 = vload8(0, src_addr_b);

5451

5452

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5453

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5454

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5455

c0 = fma((half8)a0.s0, b0, c0);

5456

c1 = fma((half8)a0.s1, b0, c1);

5457

c2 = fma((half8)a0.s2, b0, c2);

5458

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5459

5460

// Load values from matrix A (interleaved) and matrix B (transposed)

5461

a0 = vload4(0, src_addr_a);

5462

b0 = vload8(0, src_addr_b);

5463

5464

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5465

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5466

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5467

c0 = fma((half8)a0.s0, b0, c0);

5468

c1 = fma((half8)a0.s1, b0, c1);

5469

c2 = fma((half8)a0.s2, b0, c2);

5470

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5471

5472

// Load values from matrix A (interleaved) and matrix B (transposed)

5473

a0 = vload4(0, src_addr_a);

5474

b0 = vload8(0, src_addr_b);

5475

5476

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5477

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5478

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5479

c0 = fma((half8)a0.s0, b0, c0);

5480

c1 = fma((half8)a0.s1, b0, c1);

5481

c2 = fma((half8)a0.s2, b0, c2);

5482

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5483

5484

// Load values from matrix A (interleaved) and matrix B (transposed)

5485

a0 = vload4(0, src_addr_a);

5486

b0 = vload8(0, src_addr_b);

5487

5488

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5489

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5490

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5491

c0 = fma((half8)a0.s0, b0, c0);

5492

c1 = fma((half8)a0.s1, b0, c1);

5493

c2 = fma((half8)a0.s2, b0, c2);

5494

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5495

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

5496

}

5497

5498

for(; i < (int)(COLS_MTX_B); ++i)

5499

{

5500

// Load values from matrix A (interleaved) and matrix B (transposed)

5501

half4 a0 = vload4(0, src_addr_a);

5502

half8 b0 = vload8(0, src_addr_b);

5503

5504

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5505

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5506

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5507

c0 = fma((half8)a0.s0, b0, c0);

5508

c1 = fma((half8)a0.s1, b0, c1);

5509

c2 = fma((half8)a0.s2, b0, c2);

5510

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5511

}

5512

5513

// Compute destination address

5514

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5515

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5516

// Compute dst address

5517

__global uchar *dst_addr = offset(&dst, 0, 0);

5518

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5519

uint4 zout = 0;

5520

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5521

#if defined(REINTERPRET_OUTPUT_AS_3D)

5522

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5523

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5524

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5529

// |******************|

5530

// | cross_plane_pad |

5531

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5536

5537

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5538

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

5539

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5540

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5541

// Add offset due to the cross plane paddings

5542

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5543

5544

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5545

// multiply dst_stride_z by DEPTH_GEMM3D

5546

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5547

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5548

// Add offset for batched GEMM

5549

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5550

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5551

5552

// Multiply by the weight of matrix-matrix product and store the result

5553

#if defined(ALPHA)

5554

SCALE_BLOCK(4, half, c, ALPHA);

5555

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5560

5561

#if defined(BROADCAST_BIAS)

5562

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5563

5564

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5565

5566

#ifndef UNIT_BETA

5567

SCALE_BLOCK(1, half, bias, BETA);

5568

#endif // UNIT_BETA

5569

5570

// c = c + bias[broadcasted]

5571

ADD_BLOCK_BROADCAST(4, c, bias0);

5572

5573

#else // defined(BROADCAST_BIAS)

5574

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5575

2) * src2_stride_z;

5576

5577

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5578

5579

#ifndef UNIT_BETA

5580

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

5585

5586

#endif // defined(BROADCAST_BIAS)

5587

#endif // defined(BETA)

5588

5589

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

5590

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5591

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5592

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5593

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5594

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5595

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5596

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5597

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5598

}

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

5599

5600

// Undefine local defines

5601

#undef COLS_MTX_B

5602

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

5603

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5604

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

5605

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5606

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5607

#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

5608

#if defined(DATA_TYPE)

5609

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5610

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.

5611

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5612

* @note This OpenCL kernel works with floating point data types (F16/F32)

5613

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

5614

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5615

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5616

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5617

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5618

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5619

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5620

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5621

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5622

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5623

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5624

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5625

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5626

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped

5627

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5628

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5629

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5630

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

5631

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5632

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

5633

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5634

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5635

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5636

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

5637

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5638

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

5639

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5640

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5641

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5642

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5643

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5644

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5645

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5646

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5647

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5648

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5649

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5650

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5651

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5652

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5653

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5654

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5655

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5656

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5657

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5658

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5659

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

5660

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5661

#if defined(BETA)

5662

IMAGE_DECLARATION(src2),

5663

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5664

IMAGE_DECLARATION(dst),

5665

uint src0_stride_z,

5666

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5667

#if defined(BETA)

5668

uint src2_stride_z,

5669

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5670

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5671

#if defined(REINTERPRET_INPUT_AS_3D)

5672

,

5673

uint src_cross_plane_pad

5674

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5675

#if defined(REINTERPRET_OUTPUT_AS_3D)

5676

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5677

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5678

#endif // REINTERPRET_OUTPUT_AS_3D

5679

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5680

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5681

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5682

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5683

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5684

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5685

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5686

// Update address for the matrix A

5687

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5688

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5689

// Update address for the matrix B

5690

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5691

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5692

#if defined(REINTERPRET_INPUT_AS_3D)

5693

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5694

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5700

// |******************|

5701

// | cross_plane_pad |

5702

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5707

5708

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5709

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5710

zin = min(DEPTH_GEMM3D - 1, zin);

5711

5712

// Add offset due to the cross plane paddings

5713

zin *= (src_cross_plane_pad * src0_stride_y);

5714

5715

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5716

// multiply src0_stride_z by DEPTH_GEMM3D

5717

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5718

5719

#else // defined(REINTERPRET_INPUT_AS_3D)

5720

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5721

// Add offset for batched GEMM

5722

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5723

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5724

#endif // defined(REINTERPRET_INPUT_AS_3D)

5725

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5726

#if defined(MATRIX_B_DEPTH)

5727

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5728

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

5729

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5730

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5731

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5732

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5733

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

5734

5735

VECTOR_TYPE acc0 = 0.0f;

5736

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5737

VECTOR_TYPE acc1 = 0.0f;

5738

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5739

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5740

VECTOR_TYPE acc2 = 0.0f;

5741

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5742

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5743

VECTOR_TYPE acc3 = 0.0f;

5744

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5745

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

5746

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5747

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5748

#if defined(REINTERPRET_INPUT_AS_3D)

5749

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5750

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

5751

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5752

// Load values from matrix A

5753

VEC_DATA_TYPE(DATA_TYPE, 2)

5754

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5755

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5756

VEC_DATA_TYPE(DATA_TYPE, 2)

5757

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5758

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5759

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5760

VEC_DATA_TYPE(DATA_TYPE, 2)

5761

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5762

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5763

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5764

VEC_DATA_TYPE(DATA_TYPE, 2)

5765

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5766

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5767

#endif // defined(REINTERPRET_INPUT_AS_3D)

5768

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5769

// Load values from matrix B

5770

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

5771

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5772

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5773

// Accumulate

5774

acc0 += b0 * (VECTOR_TYPE)a0.s0;

5775

acc0 += b1 * (VECTOR_TYPE)a0.s1;

5776

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5777

acc1 += b0 * (VECTOR_TYPE)a1.s0;

5778

acc1 += b1 * (VECTOR_TYPE)a1.s1;

5779

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5780

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5781

acc2 += b0 * (VECTOR_TYPE)a2.s0;

5782

acc2 += b1 * (VECTOR_TYPE)a2.s1;

5783

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5784

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5785

acc3 += b0 * (VECTOR_TYPE)a3.s0;

5786

acc3 += b1 * (VECTOR_TYPE)a3.s1;

5787

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5788

}

5789

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5790

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5791

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5792

#if defined(REINTERPRET_INPUT_AS_3D)

5793

// Load values from matrix A

5794

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

5795

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5796

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

5797

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5798

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5799

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

5800

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5801

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5802

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

5803

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5804

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5805

// Load values from matrix A

5806

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5807

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5808

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5809

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5810

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5811

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5812

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5813

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5814

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5815

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5816

#endif // defined(REINTERPRET_INPUT_AS_3D)

5817

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5818

// Load values from matrix B

5819

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5820

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5821

// Accumulate

5822

acc0 += b0 * (VECTOR_TYPE)a0;

5823

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5824

acc1 += b0 * (VECTOR_TYPE)a1;

5825

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5826

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5827

acc2 += b0 * (VECTOR_TYPE)a2;

5828

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5829

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5830

acc3 += b0 * (VECTOR_TYPE)a3;

5831

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5832

}

5833

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5834

int z = get_global_id(2);

5835

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5836

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5837

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5838

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5839

// Compute dst address

5840

__global uchar *dst_addr = offset(&dst, 0, 0);

5841

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5842

uint4 zout = 0;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5843

5844

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5845

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5846

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5847

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5848

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5853

// |******************|

5854

// | cross_plane_pad |

5855

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5860

5861

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5862

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5863

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5864

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5865

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5866

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5867

5868

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5869

// multiply dst_stride_z by DEPTH_GEMM3D

5870

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5871

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5872

// Add offset for batched GEMM

5873

dst_addr += z * dst_stride_z;

5874

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5875

5876

// Multiply by the weight of matrix-matrix product and store the result

5877

#if defined(ALPHA)

5878

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);

5879

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

5884

5885

#if defined(BROADCAST_BIAS)

5886

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));

5887

5888

LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

5889

5890

#ifndef UNIT_BETA

5891

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

5892

#endif // UNIT_BIAS

5893

5894

// c = c + bias[broadcasted]

5895

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

5896

5897

#else // defined(BROADCAST_BIAS)

5898

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *

5899

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

5900

5901

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

5902

5903

#ifndef UNIT_BETA

5904

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

5909

5910

#endif // defined(BROADCAST_BIAS)

5911

#endif // defined(BETA)

5912

5913

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

5914

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5915

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5916

5917

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5918

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5919

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5920

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5921

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

5922

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5923

*

5924

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

5925

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

5926

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

5927

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

5928

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5929

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5930

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5931

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5932

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5933

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5934

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5935

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5936

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5937

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5938

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5939

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5940

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5941

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5942

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5943

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5944

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5945

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5946

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5947

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5948

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5949

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5950

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5951

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5952

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5953

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5954

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5955

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5956

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5957

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5958

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5959

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

5960

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5961

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

5962

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5963

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

5964

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5965

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5966

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5967

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5968

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5969

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5970

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5971

*/

5972

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

5973

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5974

#if defined(BETA)

5975

IMAGE_DECLARATION(src2),

5976

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5977

IMAGE_DECLARATION(dst),

5978

uint src0_stride_z,

5979

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5980

#if defined(BETA)

5981

uint src2_stride_z,

5982

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5983

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5984

#if defined(REINTERPRET_INPUT_AS_3D)

5985

,

5986

uint src_cross_plane_pad

5987

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5988

#if defined(REINTERPRET_OUTPUT_AS_3D)

5989

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5990

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5991

#endif // REINTERPRET_OUTPUT_AS_3D

5992

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5993

{

5994

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

5995

5996

// Compute starting address for matrix A and matrix B

5997

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

5998

5999

// Update address for matrix A

6000

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6001

6002

// Update address for matrix B

6003

src_addr.s1 += idx * sizeof(float);

6004

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6005

#if defined(REINTERPRET_INPUT_AS_3D)

6006

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6007

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

6013

// |******************|

6014

// | cross_plane_pad |

6015

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

6020

6021

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6022

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6023

zin = min(DEPTH_GEMM3D - 1, zin);

6024

6025

// Add offset due to the cross plane paddings

6026

zin *= (src_cross_plane_pad * src0_stride_y);

6027

6028

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6029

// multiply src0_stride_z by DEPTH_GEMM3D

6030

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6031

6032

#else // defined(REINTERPRET_INPUT_AS_3D)

6033

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6034

// Add offset for batched GEMM

6035

src_addr.s0 += get_global_id(2) * src0_stride_z;

6036

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6037

#endif // defined(REINTERPRET_INPUT_AS_3D)

6038

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6039

#if defined(MATRIX_B_DEPTH)

6040

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6041

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6042

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6043

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6044

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6045

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6046

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6047

float4 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6048

6049

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6050

float4 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6051

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6052

6053

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6054

float4 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6055

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6056

6057

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6058

float4 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6059

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6060

6061

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6062

int i = 0;

6063

for(; i <= ((int)COLS_A - 4); i += 4)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6064

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6065

#if defined(REINTERPRET_INPUT_AS_3D)

6066

// Load values from matrix A and matrix B

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6067

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

6068

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6069

// Load values from matrix A and matrix B

6070

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6071

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6072

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6073

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6074

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6075

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6076

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6077

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6078

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6079

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6080

#endif // defined(REINTERPRET_INPUT_AS_3D)

6081

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6082

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6083

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6084

6085

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6086

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

6087

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

6088

acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);

6089

acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6090

6091

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6092

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6093

acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);

6094

acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);

6095

acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);

6096

acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6097

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6098

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6099

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6100

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6101

acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);

6102

acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);

6103

acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);

6104

acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6105

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6106

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6107

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6108

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6109

acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);

6110

acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);

6111

acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);

6112

acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6113

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6114

6115

// Load values from matrix A and matrix B

6116

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6117

src_addr.s1 += src1_stride_y;

6118

6119

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6120

acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);

6121

acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);

6122

acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);

6123

acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6124

6125

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6126

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6127

acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);

6128

acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);

6129

acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);

6130

acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6131

6132

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6133

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6134

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6135

acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);

6136

acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);

6137

acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);

6138

acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6139

6140

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6141

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6142

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6143

acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);

6144

acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);

6145

acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);

6146

acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6147

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6148

6149

// Load values from matrix A and matrix B

6150

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6151

src_addr.s1 += src1_stride_y;

6152

6153

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6154

acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);

6155

acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);

6156

acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);

6157

acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6158

6159

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6160

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6161

acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);

6162

acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);

6163

acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);

6164

acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6165

6166

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6167

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6168

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6169

acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);

6170

acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);

6171

acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);

6172

acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6173

6174

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6175

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6176

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6177

acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);

6178

acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);

6179

acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);

6180

acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6181

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6182

6183

// Load values from matrix A and matrix B

6184

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6185

src_addr.s1 += src1_stride_y;

6186

6187

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6188

acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);

6189

acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);

6190

acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);

6191

acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6192

6193

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6194

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6195

acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);

6196

acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);

6197

acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);

6198

acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6199

6200

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6201

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6202

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6203

acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);

6204

acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);

6205

acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);

6206

acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6207

6208

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6209

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6210

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6211

acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);

6212

acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);

6213

acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);

6214

acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6215

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6216

6217

src_addr.s0 += 4 * sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6218

}

6219

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6220

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6221

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6222

#if defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6223

// Load values from matrix A

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6224

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6225

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6226

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6227

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6228

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6229

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6230

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6231

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6232

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6233

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6234

#else // defined(REINTERPRET_INPUT_AS_3D)

6235

// Load values from matrix A

6236

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6237

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6238

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6239

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6240

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6241

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6242

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6243

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6244

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6245

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6246

#endif // defined(REINTERPRET_INPUT_AS_3D)

6247

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6248

// Load values from matrix B

6249

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6250

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6251

6252

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6253

acc0.s0 = fma(a0, b0.s0, acc0.s0);

6254

acc0.s1 = fma(a0, b0.s1, acc0.s1);

6255

acc0.s2 = fma(a0, b0.s2, acc0.s2);

6256

acc0.s3 = fma(a0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6257

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6258

acc1.s0 = fma(a1, b0.s0, acc1.s0);

6259

acc1.s1 = fma(a1, b0.s1, acc1.s1);

6260

acc1.s2 = fma(a1, b0.s2, acc1.s2);

6261

acc1.s3 = fma(a1, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6262

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6263

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6264

acc2.s0 = fma(a2, b0.s0, acc2.s0);

6265

acc2.s1 = fma(a2, b0.s1, acc2.s1);

6266

acc2.s2 = fma(a2, b0.s2, acc2.s2);

6267

acc2.s3 = fma(a2, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6268

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6269

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6270

acc3.s0 = fma(a3, b0.s0, acc3.s0);

6271

acc3.s1 = fma(a3, b0.s1, acc3.s1);

6272

acc3.s2 = fma(a3, b0.s2, acc3.s2);

6273

acc3.s3 = fma(a3, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6274

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6275

6276

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6277

}

6278

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6279

int z = get_global_id(2);

6280

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6281

// Compute destination address

6282

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6283

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6284

// Compute dst address

6285

__global uchar *dst_addr = offset(&dst, 0, 0);

6286

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6287

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

6288

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6289

#if defined(REINTERPRET_OUTPUT_AS_3D)

6290

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6291

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6292

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

6297

// |******************|

6298

// | cross_plane_pad |

6299

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6304

6305

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6306

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6307

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6308

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6309

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6310

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6311

6312

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6313

// multiply dst_stride_z by DEPTH_GEMM3D

6314

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6315

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6316

// Add offset for batched GEMM

6317

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6318

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6319

6320

// Multiply by the weight of matrix-matrix product and store the result

6321

#if defined(ALPHA)

6322

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

6323

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6328

6329

#if defined(BROADCAST_BIAS)

6330

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

6331

6332

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

6333

6334

#ifndef UNIT_BETA

6335

SCALE_BLOCK(1, float, bias, BETA);

6336

#endif // UNIT_BIAS

6337

6338

// acc = acc + bias[broadcasted]

6339

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6340

6341

#else // defined(BROADCAST_BIAS)

6342

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *

6343

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6344

6345

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

6346

6347

#ifndef UNIT_BETA

6348

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BIAS

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6353

6354

#endif // defined(BROADCAST_BIAS)

6355

#endif // defined(BETA)

6356

6357

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

6358

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6359

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6360

6361

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6362

vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6363

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6364

vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6365

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6366

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6367

vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6368

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6369

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6370

vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6371

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6372

}

6373

6374

/** This OpenCL kernel computes the matrix-by-matrix multiplication between matrix A (src0) and matrix B (src1) when neither matrix has been reshaped

6375

*

6376

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

6377

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.

6378

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

6379

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

6380

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

6381

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6382

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

6383

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6384

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6385

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

6386

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6387

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

6388

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6389

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

6390

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

6391

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

6392

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of rows (M) of matrix A NOT reshaped

6393

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6394

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6395

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6396

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

6397

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6398

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

6399

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

6400

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

6401

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6402

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

6403

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6404

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

6405

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6406

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

6407

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

6408

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

6409

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

6410

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

6411

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6412

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

6413

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6414

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

6415

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6416

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

6417

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6418

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

6419

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6420

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6421

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6422

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

6423

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6424

*/

6425

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

6426

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6427

#if defined(BETA)

6428

IMAGE_DECLARATION(src2),

6429

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6430

IMAGE_DECLARATION(dst),

6431

uint src0_stride_z,

6432

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6433

#if defined(BETA)

6434

uint src2_stride_z,

6435

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6436

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6437

#if defined(REINTERPRET_INPUT_AS_3D)

6438

,

6439

uint src_cross_plane_pad

6440

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6441

#if defined(REINTERPRET_OUTPUT_AS_3D)

6442

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6443

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6444

#endif // REINTERPRET_OUTPUT_AS_3D

6445

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6446

{

6447

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6448

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

6449

6450

// Compute starting address for matrix A and Matrix B

6451

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

6452

6453

// Update address for the matrix A

6454

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6455

6456

// Update address for the matrix B

6457

src_addr.s1 += idx * sizeof(float);

6458

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6459

#if defined(REINTERPRET_INPUT_AS_3D)

6460

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6461

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

6467

// |******************|

6468

// | cross_plane_pad |

6469

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

6474

6475

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6476

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6477

zin = min(DEPTH_GEMM3D - 1, zin);

6478

6479

// Add offset due to the cross plane paddings

6480

zin *= (src_cross_plane_pad * src0_stride_y);

6481

6482

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6483

// multiply src0_stride_z by DEPTH_GEMM3D

6484

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6485

6486

#else // defined(REINTERPRET_INPUT_AS_3D)

6487

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6488

// Add offset for batched GEMM

6489

src_addr.s0 += get_global_id(2) * src0_stride_z;

6490

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6491

#endif // defined(REINTERPRET_INPUT_AS_3D)

6492

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6493

#if defined(MATRIX_B_DEPTH)

6494

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6495

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6496

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6497

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6498

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6499

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6500

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6501

float2 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6502

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6503

float2 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6504

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6505

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6506

float2 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6507

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6508

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6509

float2 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6510

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6511

6512

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6513

int i = 0;

6514

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6515

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6516

#if defined(REINTERPRET_INPUT_AS_3D)

6517

// Load values from matrix A

6518

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));

6519

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6520

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6521

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6522

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6523

6524

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6525

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6526

src_addr.s1 += src1_stride_y;

6527

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6528

src_addr.s1 += src1_stride_y;

6529

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6530

src_addr.s1 += src1_stride_y;

6531

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6532

src_addr.s1 += src1_stride_y;

6533

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6534

src_addr.s1 += src1_stride_y;

6535

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6536

src_addr.s1 += src1_stride_y;

6537

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6538

src_addr.s1 += src1_stride_y;

6539

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6540

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6541

6542

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6543

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

6544

acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);

6545

acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);

6546

acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);

6547

acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);

6548

acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);

6549

acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);

6550

acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6551

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6552

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

6553

acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);

6554

acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);

6555

acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);

6556

acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);

6557

acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);

6558

acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);

6559

acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6560

6561

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6562

#if defined(REINTERPRET_INPUT_AS_3D)

6563

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6564

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6565

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6566

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6567

acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);

6568

acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);

6569

acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);

6570

acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);

6571

acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);

6572

acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);

6573

acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);

6574

acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6575

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6576

acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);

6577

acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);

6578

acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);

6579

acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);

6580

acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);

6581

acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);

6582

acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);

6583

acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6584

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6585

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6586

#if defined(REINTERPRET_INPUT_AS_3D)

6587

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6588

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6589

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6590

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6591

acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);

6592

acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);

6593

acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);

6594

acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);

6595

acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);

6596

acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);

6597

acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);

6598

acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6599

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6600

acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);

6601

acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);

6602

acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);

6603

acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);

6604

acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);

6605

acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);

6606

acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);

6607

acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6608

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6609

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6610

#if defined(REINTERPRET_INPUT_AS_3D)

6611

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6612

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6613

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6614

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6615

acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);

6616

acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);

6617

acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);

6618

acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);

6619

acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);

6620

acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);

6621

acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);

6622

acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6623

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6624

acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);

6625

acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);

6626

acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);

6627

acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);

6628

acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);

6629

acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);

6630

acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);

6631

acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6632

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6633

6634

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6635

}

6636

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6637

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6638

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6639

#if defined(REINTERPRET_INPUT_AS_3D)

6640

// Load values from matrix A

6641

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6642

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6643

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6644

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6645

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6646

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6647

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6648

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6649

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6650

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6651

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6652

// Load values from matrix A

6653

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6654

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6655

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6656

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6657

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6658

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6659

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6660

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6661

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6662

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6663

#endif // defined(REINTERPRET_INPUT_AS_3D)

6664

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6665

// Load values from matrix B

6666

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6667

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6668

6669

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6670

acc0.s0 = fma(a0, b0.s0, acc0.s0);

6671

acc0.s1 = fma(a0, b0.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6672

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6673

acc1.s0 = fma(a1, b0.s0, acc1.s0);

6674

acc1.s1 = fma(a1, b0.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6675

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6676

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6677

acc2.s0 = fma(a2, b0.s0, acc2.s0);

6678

acc2.s1 = fma(a2, b0.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6679

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6680

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6681

acc3.s0 = fma(a3, b0.s0, acc3.s0);

6682

acc3.s1 = fma(a3, b0.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6683

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6684

6685

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6686

}

6687

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6688

int z = get_global_id(2);

6689

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6690

// Compute destination address

6691

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6692

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6693

// Compute dst address

6694

__global uchar *dst_addr = offset(&dst, 0, 0);

6695

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6696

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

6697

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6698

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6699

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6700

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6701

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6702

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

6707

// |******************|

6708

// | cross_plane_pad |

6709

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6714

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6715

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6716

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6717

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6718

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6719

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6720

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6721

6722

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6723

// multiply dst_stride_z by DEPTH_GEMM3D

6724

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6725

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6726

// Add offset for batched GEMM

6727

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6728

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6729

6730

// Multiply by the weight of matrix-matrix product and store the result

6731

#if defined(ALPHA)

6732

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

6733

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6738

6739

#if defined(BROADCAST_BIAS)

6740

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));

6741

6742

LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

6743

6744

#ifndef UNIT_BETA

6745

SCALE_BLOCK(1, float, bias, BETA);

6746

#endif // UNIT_BETA

6747

6748

// acc = acc + bias[broadcasted]

6749

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6750

6751

#else // defined(BROADCAST_BIAS)

6752

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *

6753

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6754

6755

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

6756

6757

#ifndef UNIT_BETA

6758

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6763

6764

#endif // defined(BROADCAST_BIAS)

6765

#endif // defined(BETA)

6766

6767

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

6768

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6769

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6770

6771

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6772

vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6773

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6774

vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6775

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6776

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6777

vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6778

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6779

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6780

vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6781

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6782

}

6783

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

6784

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6785

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

6786

*

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6787

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.

6788

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

6789

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

6790

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

6791

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6792

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

6793

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6794

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6795

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

6796

* The activation function is performed after the bias addition

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6797

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

6798

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

6799

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

6800

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

6801

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

6802

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

6803

*

6804

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

6805

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6806

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

6807

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6808

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

6809

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

6810

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

6811

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6812

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

6813

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6814

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

6815

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6816

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

6817

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

6818

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

6819

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

6820

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

6821

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6822

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

6823

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6824

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

6825

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6826

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

6827

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

6828

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

6829

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6830

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6831

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

6832

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

6833

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

6834

*/

6835

__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),

6836

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6837

#if defined(BETA)

6838

IMAGE_DECLARATION(src2),

6839

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6840

IMAGE_DECLARATION(dst),

6841

uint src0_stride_z,

6842

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6843

#if defined(BETA)

6844

uint src2_stride_z,

6845

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6846

uint dst_stride_z

6847

#if defined(REINTERPRET_INPUT_AS_3D)

6848

,

6849

uint src_cross_plane_pad

6850

#endif // REINTERPRET_INPUT_AS_3D

6851

#if defined(REINTERPRET_OUTPUT_AS_3D)

6852

,

6853

uint dst_cross_plane_pad

6854

#endif // REINTERPRET_OUTPUT_AS_3D

6855

)

6856

{

6857

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

6858

6859

// Compute starting address for matrix A and Matrix B

6860

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

6861

6862

// Update address for the matrix A

6863

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6864

6865

// Update address for the matrix B

6866

src_addr.s1 += idx * sizeof(half);

6867

6868

#if defined(REINTERPRET_INPUT_AS_3D)

6869

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6870

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

6876

// |******************|

6877

// | cross_plane_pad |

6878

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

6883

6884

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6885

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6886

zin = min(DEPTH_GEMM3D - 1, zin);

6887

6888

// Add offset due to the cross plane paddings

6889

zin *= (src_cross_plane_pad * src0_stride_y);

6890

6891

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6892

// multiply src0_stride_z by DEPTH_GEMM3D

6893

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6894

6895

#else // defined(REINTERPRET_INPUT_AS_3D)

6896

6897

// Add offset for batched GEMM

6898

src_addr.s0 += get_global_id(2) * src0_stride_z;

6899

6900

#endif // defined(REINTERPRET_INPUT_AS_3D)

6901

6902

#if defined(MATRIX_B_DEPTH)

6903

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6904

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6905

#else // defined(MATRIX_B_DEPTH)

6906

src_addr.s1 += get_global_id(2) * src1_stride_z;

6907

#endif // defined(MATRIX_B_DEPTH)

6908

6909

float8 acc0 = 0.0h;

6910

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6911

float8 acc1 = 0.0h;

6912

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6913

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6914

float8 acc2 = 0.0h;

6915

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6916

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6917

float8 acc3 = 0.0h;

6918

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6919

6920

int i = 0;

6921

for(; i <= ((int)COLS_A - 4); i += 4)

6922

{

6923

#if defined(REINTERPRET_INPUT_AS_3D)

6924

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6925

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

6926

#else // defined(REINTERPRET_INPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6927

// Load values from matrix A

6928

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6929

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6930

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6931

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6932

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6933

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6934

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6935

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6936

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6937

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6938

#endif // defined(REINTERPRET_INPUT_AS_3D)

6939

6940

// Load values from matrix B

6941

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6942

src_addr.s1 += src1_stride_y;

6943

6944

// Accumulate

6945

acc0 = fma(b0, (float8)a0.s0, acc0);

6946

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6947

acc1 = fma(b0, (float8)a1.s0, acc1);

6948

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6949

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6950

acc2 = fma(b0, (float8)a2.s0, acc2);

6951

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6952

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6953

acc3 = fma(b0, (float8)a3.s0, acc3);

6954

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6955

6956

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6957

src_addr.s1 += src1_stride_y;

6958

acc0 = fma(b0, (float8)a0.s1, acc0);

6959

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6960

acc1 = fma(b0, (float8)a1.s1, acc1);

6961

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6962

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6963

acc2 = fma(b0, (float8)a2.s1, acc2);

6964

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6965

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6966

acc3 = fma(b0, (float8)a3.s1, acc3);

6967

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6968

6969

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6970

src_addr.s1 += src1_stride_y;

6971

acc0 = fma(b0, (float8)a0.s2, acc0);

6972

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6973

acc1 = fma(b0, (float8)a1.s2, acc1);

6974

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6975

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6976

acc2 = fma(b0, (float8)a2.s2, acc2);

6977

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6978

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6979

acc3 = fma(b0, (float8)a3.s2, acc3);

6980

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6981

6982

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6983

src_addr.s1 += src1_stride_y;

6984

acc0 = fma(b0, (float8)a0.s3, acc0);

6985

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6986

acc1 = fma(b0, (float8)a1.s3, acc1);

6987

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6988

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6989

acc2 = fma(b0, (float8)a2.s3, acc2);

6990

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6991

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6992

acc3 = fma(b0, (float8)a3.s3, acc3);

6993

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6994

6995

src_addr.s0 += 4 * sizeof(half);

6996

}

6997

6998

for(; i < (int)COLS_A; ++i)

6999

{

7000

#if defined(REINTERPRET_INPUT_AS_3D)

7001

// Load values from matrix A

7002

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

7003

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7004

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

7005

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7006

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7007

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

7008

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7009

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7010

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

7011

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7012

#else // defined(REINTERPRET_INPUT_AS_3D)

7013

// Load values from matrix A

7014

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

7015

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7016

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

7017

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7018

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7019

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

7020

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7021

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7022

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

7023

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7024

#endif // defined(REINTERPRET_INPUT_AS_3D)

7025

7026

// Load values from matrix B

7027

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

7028

7029

src_addr += (int2)(sizeof(half), src1_stride_y);

7030

7031

// Accumulate

7032

acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;

7033

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7034

acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;

7035

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7036

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7037

acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;

7038

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7039

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7040

acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;

7041

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7042

}

7043

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7044

int z = get_global_id(2);

7045

7046

// Compute destination address

7047

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

7048

7049

// Compute dst address

7050

__global uchar *dst_addr = offset(&dst, 0, 0);

7051

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7052

uint4 zout = 0;

7053

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7054

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7055

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7056

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

7057

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

7063

// |******************|

7064

// | cross_plane_pad |

7065

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

7070

7071

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7072

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

7073

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7074

7075

// Add offset due to the cross plane paddings

7076

zout *= (dst_cross_plane_pad * dst_stride_y);

7077

7078

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

7079

// multiply dst_stride_z by DEPTH_GEMM3D

7080

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7081

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7082

// Add offset for batched GEMM

7083

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7084

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7085

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7086

// Multiply by the weight of matrix-matrix product and store the result

7087

#if defined(ALPHA)

7088

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

7089

#endif // defined(ALPHA)

7090

7091

#if defined(BETA)

7092

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

7093

7094

#if defined(BROADCAST_BIAS)

7095

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

7096

7097

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7098

7099

float8 bias_f0 = convert_float8(bias0);

7100

7101

#ifndef UNIT_BETA

7102

SCALE_BLOCK(1, float, bias_f, BETA);

7103

#endif // UNIT_BETA

7104

7105

// acc = acc + bias[broadcasted]

7106

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);

7107

7108

#else // defined(BROADCAST_BIAS)

7109

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

7110

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

7111

7112

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7113

7114

float8 bias_f0 = convert_float8(bias0);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7115

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7116

float8 bias_f1 = convert_float8(bias1);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7117

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7118

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7119

float8 bias_f2 = convert_float8(bias2);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7120

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7121

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7122

float8 bias_f3 = convert_float8(bias3);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7123

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7124

7125

#ifndef UNIT_BETA

7126

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);

7131

7132

#endif // defined(BROADCAST_BIAS)

7133

#endif // defined(BETA)

7134

7135

half8 acc_h0 = convert_half8(acc0);

7136

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7137

half8 acc_h1 = convert_half8(acc1);

7138

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7139

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7140

half8 acc_h2 = convert_half8(acc2);

7141

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7142

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7143

half8 acc_h3 = convert_half8(acc3);

7144

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7145

7146

#if defined(ACTIVATION_TYPE)

Giorgio Arena

2020-10-12 11:53:51 +0100

[diff] [blame]

7147

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, VEC_SIZE, acc_h, A_VAL, B_VAL);

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7148

#endif // defined(ACTIVATION_TYPE)

7149

7150

// Store the output block

7151

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7152

}

7153

7154

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

7155

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7156

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.

7157

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

7158

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

7159

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

7160

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7161

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

7162

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7163

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7164

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

7165

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7166

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

7167

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7168

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

7169

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

7170

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

7171

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

7172

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7173

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

7174

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

7175

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7176

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

7177

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7178

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

7179

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

7180

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

7181

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7182

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

7183

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7184

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7185

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

7186

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

7187

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

7188

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

7189

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

7190

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7191

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

7192

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7193

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7194

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7195

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

7196

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7197

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

7198

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7199

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7200

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7201

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

7202

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7203

*/

7204

__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
#if defined(BETA)
                                                 IMAGE_DECLARATION(src2),
#endif // defined(BETA)
                                                 IMAGE_DECLARATION(dst),
                                                 uint src0_stride_z,
                                                 uint src1_stride_z,
#if defined(BETA)
                                                 uint src2_stride_z,
#endif // defined(BETA)
                                                 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                 ,
                                                 uint src_cross_plane_pad
#endif // defined(REINTERPRET_INPUT_AS_3D)
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                 ,
                                                 uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                                                 )
{
    // Index of the first output column handled by this work-item
    const int first_col = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Running byte offsets into matrix A (lhs) and matrix B (rhs)
    int lhs_offset = src0_offset_first_element_in_bytes;
    int rhs_offset = src1_offset_first_element_in_bytes;

    // Matrix A: jump to the first of the NUM_ELEMS_PROCESSED_PER_THREAD_Y rows of this work-item
    lhs_offset += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Matrix B: jump to the first column of this work-item
    rhs_offset += first_col * sizeof(half);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The 2D input tile is read from a 3D tensor, so every row has to account for the
    // cross-plane padding of all the planes that precede it:
    //
    //  |   plane0         |
    //  |__________________|
    //  |******************|
    //  |  cross_plane_pad |
    //  |******************|
    //  |   plane1         |
    //  |__________________|

    // Plane index of each row: (row index) / HEIGHT_GEMM3D, clamped to the last plane
    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zin       = min(DEPTH_GEMM3D - 1, zin);

    // Turn the plane index into the byte offset introduced by the cross-plane paddings
    zin *= (src_cross_plane_pad * src0_stride_y);

    // Batched GEMM: the batch index (global id 2) is scaled by src0_stride_z * DEPTH_GEMM3D
    lhs_offset += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Batched GEMM: one src0_stride_z step per batch
    lhs_offset += get_global_id(2) * src0_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

#if defined(MATRIX_B_DEPTH)
    // Wrap the batch index so that matrix B is not slid past its MATRIX_B_DEPTH planes
    rhs_offset += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += get_global_id(2) * src1_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // One 8-wide accumulator per output row
    half8 acc0 = 0.0h;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    half8 acc1 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    half8 acc2 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    half8 acc3 = 0.0h;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

    int k = 0;

    // Main loop: consume four columns of A (and four rows of B) per iteration
    for(; k <= ((int)COLS_A - 4); k += 4)
    {
        // Four consecutive values from each processed row of matrix A
#if defined(REINTERPRET_INPUT_AS_3D)
        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, lhs_offset, src0_stride_y, zin.s);
#else // defined(REINTERPRET_INPUT_AS_3D)
        half4 a0 = vload4(0, (__global half *)(src0_ptr + lhs_offset + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half4 a1 = vload4(0, (__global half *)(src0_ptr + lhs_offset + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half4 a2 = vload4(0, (__global half *)(src0_ptr + lhs_offset + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half4 a3 = vload4(0, (__global half *)(src0_ptr + lhs_offset + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // Four consecutive rows of matrix B, advancing the rhs offset one row at a time
        const half8 b0 = vload8(0, (__global half *)(src1_ptr + rhs_offset));
        rhs_offset += src1_stride_y;
        const half8 b1 = vload8(0, (__global half *)(src1_ptr + rhs_offset));
        rhs_offset += src1_stride_y;
        const half8 b2 = vload8(0, (__global half *)(src1_ptr + rhs_offset));
        rhs_offset += src1_stride_y;
        const half8 b3 = vload8(0, (__global half *)(src1_ptr + rhs_offset));
        rhs_offset += src1_stride_y;

        // Accumulate: each output row chains its four fused multiply-adds in increasing k order
        acc0 = fma(b0, (half8)a0.s0, acc0);
        acc0 = fma(b1, (half8)a0.s1, acc0);
        acc0 = fma(b2, (half8)a0.s2, acc0);
        acc0 = fma(b3, (half8)a0.s3, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1.s0, acc1);
        acc1 = fma(b1, (half8)a1.s1, acc1);
        acc1 = fma(b2, (half8)a1.s2, acc1);
        acc1 = fma(b3, (half8)a1.s3, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2.s0, acc2);
        acc2 = fma(b1, (half8)a2.s1, acc2);
        acc2 = fma(b2, (half8)a2.s2, acc2);
        acc2 = fma(b3, (half8)a2.s3, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3.s0, acc3);
        acc3 = fma(b1, (half8)a3.s1, acc3);
        acc3 = fma(b2, (half8)a3.s2, acc3);
        acc3 = fma(b3, (half8)a3.s3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

        // Move four half elements forward along the rows of matrix A
        lhs_offset += 4 * sizeof(half);
    }

    // Left-over loop: handle the remaining columns of A one at a time
    for(; k < (int)COLS_A; ++k)
    {
        // One value from each processed row of matrix A
#if defined(REINTERPRET_INPUT_AS_3D)
        half a0 = *((__global half *)(src0_ptr + lhs_offset + 0 * src0_stride_y + zin.s0));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + lhs_offset + 1 * src0_stride_y + zin.s1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + lhs_offset + 2 * src0_stride_y + zin.s2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + lhs_offset + 3 * src0_stride_y + zin.s3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#else  // defined(REINTERPRET_INPUT_AS_3D)
        half a0 = *((__global half *)(src0_ptr + lhs_offset + 0 * src0_stride_y));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        half a1 = *((__global half *)(src0_ptr + lhs_offset + 1 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        half a2 = *((__global half *)(src0_ptr + lhs_offset + 2 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        half a3 = *((__global half *)(src0_ptr + lhs_offset + 3 * src0_stride_y));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#endif // defined(REINTERPRET_INPUT_AS_3D)

        // One row of matrix B
        half8 b0 = vload8(0, (__global half *)(src1_ptr + rhs_offset));

        // Next column of A, next row of B
        lhs_offset += sizeof(half);
        rhs_offset += src1_stride_y;

        // Accumulate
        acc0 = fma(b0, (half8)a0, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 = fma(b0, (half8)a1, acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 = fma(b0, (half8)a2, acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 = fma(b0, (half8)a3, acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    }

    const int batch = get_global_id(2);

    // Destination address of the output tile of this work-item
    Image           dst_img  = CONVERT_TO_IMAGE_STRUCT(dst);
    __global uchar *dst_addr = offset(&dst_img, 0, 0);

    // Per-row extra byte offset used when storing; stays zero unless the output is reinterpreted as 3D
    uint4 zout = 0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The 2D output tile is written into a 3D tensor, so every row has to account for the
    // cross-plane padding of all the planes that precede it (same layout as for the 3D input case)

    // Plane index of each row: (row index) / HEIGHT_GEMM3D, clamped to the last plane
    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
    zout = min(DEPTH_GEMM3D - 1, zout);

    // Turn the plane index into the byte offset introduced by the cross-plane paddings
    zout *= (dst_cross_plane_pad * dst_stride_y);

    // Batched GEMM: the batch index is scaled by dst_stride_z * DEPTH_GEMM3D
    dst_addr += batch * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    // Batched GEMM: one dst_stride_z step per batch
    dst_addr += batch * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Scale the matrix-matrix product by ALPHA
#if defined(ALPHA)
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);
#endif // defined(ALPHA)

    // Add BETA * bias
#if defined(BETA)
    REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

#if defined(BROADCAST_BIAS)
    // A single bias row is loaded and added to every accumulator row
    __global uchar *bias_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

    LOAD_BLOCK(1, 8, half, bias, bias_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, half, bias, BETA);
#endif // UNIT_BETA

    // acc = acc + bias[broadcasted]
    ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

#else // defined(BROADCAST_BIAS)
    // One bias row per accumulator row, addressed like the output tile (column, row block, batch)
    __global uchar *bias_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *
                                (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

    LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, bias_addr, 0, src2_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);
#endif // UNIT_BETA

    // acc = acc + bias
    ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // Optional activation, applied after the bias addition
#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, VEC_SIZE, acc, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Write the output tile
    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
}

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

7499

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7500

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

7501

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7502

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7503

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7504

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

7505

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7506

* @note The beta's value needs to be passed at compile time using -DBETA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7507

*

7508

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

7509

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

7510

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7511

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

7512

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7513

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

7514

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7515

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7516

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7517

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7518

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7519

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7520

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7521

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

7522

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7523

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7524

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7525

__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item positions inside the source (matrix C) and destination (A x B) tensors
    Tensor3D c_tensor  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D ab_tensor = CONVERT_TO_TENSOR3D_STRUCT(dst);

    __global float *ab_ptr = (__global float *)ab_tensor.ptr;
    __global float *c_ptr  = (__global float *)c_tensor.ptr;

    // Four values already stored in the destination (the A x B result)
    const float4 ab_vals = vload4(0, ab_ptr);

    // Four values of matrix C
    const float4 c_vals = vload4(0, c_ptr);

    // In-place update: dst = dst + BETA * src
    const float4 result = ab_vals + (float4)BETA * c_vals;

    vstore4(result, 0, ab_ptr);
}

7544

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

7545

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7546

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

7547

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7548

* @note The beta's value needs to be passed at compile time using -DBETA

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7549

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7550

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

7551

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

7552

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7553

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

7554

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7555

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

7556

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7557

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7558

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7559

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7560

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7561

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7562

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7563

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

7564

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7565

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7566

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7567

__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Resolve the per-work-item positions inside the source (matrix C) and destination (A x B) tensors
    Tensor3D c_tensor  = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D ab_tensor = CONVERT_TO_TENSOR3D_STRUCT(dst);

    __global half *ab_ptr = (__global half *)ab_tensor.ptr;
    __global half *c_ptr  = (__global half *)c_tensor.ptr;

    // Eight values already stored in the destination (the A x B result)
    const half8 ab_vals = vload8(0, ab_ptr);

    // Eight values of matrix C
    const half8 c_vals = vload8(0, c_ptr);

    // In-place update: dst = dst + BETA * src
    const half8 result = ab_vals + (half8)BETA * c_vals;

    vstore8(result, 0, ab_ptr);
}

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

7586

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7587

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7588

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7589

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7590

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

7591

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7592

* @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7593

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7594

* @note The input A and matrix B must not be reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7595

*

7596

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

7597

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

7598

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7599

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

7600

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7601

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7602

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7603

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

7604

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7605

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

7606

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7607

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

7608

* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

7609

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7610

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7611

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7612

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7613

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7614

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

7615

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7616

*/

7617

__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
                             TENSOR3D_DECLARATION(src1),
                             IMAGE_DECLARATION(dst))
{
    // Each work-item produces four consecutive output values of one row
    const int first_col = get_global_id(0) * 4;
    const int row       = get_global_id(1);

    // Byte offsets of the read positions: row `row` of A, and plane `row` of B
    int vec_offset = src0_offset_first_element_in_bytes + src0_stride_y * row;
    int mat_offset = src1_offset_first_element_in_bytes + src1_stride_z * row;
    mat_offset += first_col * sizeof(float);

    // Byte offset one past the last element of the current row of A
    const int vec_end = vec_offset + (WIDTH_VECTOR_A * sizeof(float));

    float4 acc = 0.0f;

    // Main loop: two elements of A against two rows of B per iteration
    while(vec_offset <= (vec_end - 2 * (int)sizeof(float)))
    {
        float2 a_pair = vload2(0, (__global float *)(src0_ptr + vec_offset));
        float4 b_row0 = vload4(0, (__global float *)(src1_ptr + mat_offset));
        float4 b_row1 = vload4(0, (__global float *)(src1_ptr + mat_offset + src1_stride_y));

        acc += b_row0 * (float4)a_pair.s0;
        acc += b_row1 * (float4)a_pair.s1;

        vec_offset += 2 * sizeof(float);
        mat_offset += 2 * src1_stride_y;
    }

    // Left-over loop: one element of A against one row of B per iteration
    while(vec_offset < vec_end)
    {
        float  a_val = *((__global float *)(src0_ptr + vec_offset));
        float4 b_row = vload4(0, (__global float *)(src1_ptr + mat_offset));

        acc += b_row * (float4)a_val;

        vec_offset += sizeof(float);
        mat_offset += src1_stride_y;
    }

    // Resolve the destination position of this work-item and write the four results
    Image out_img = CONVERT_TO_IMAGE_STRUCT(dst);

    vstore4(acc, 0, (__global float *)(offset(&out_img, 0, 0)));
}

Anton Lokhmotov